diff --git a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c index 072ab8bcd4e5..7e0ac4bc2927 100644 --- a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c @@ -1,1496 +1,1497 @@ /* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. */ #if !defined(lint) static const char sccsid[] = "@(#)ip_fil.c 2.41 6/5/96 (C) 1993-2000 Darren Reed"; static const char rcsid[] = "@(#)$Id$"; #endif #if defined(KERNEL) || defined(_KERNEL) # undef KERNEL # undef _KERNEL # define KERNEL 1 # define _KERNEL 1 #endif #if defined(__FreeBSD__) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_inet6.h" #endif #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) # include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netinet/ip_compat.h" #ifdef USE_INET6 # include #endif #include "netinet/ip_fil.h" #include "netinet/ip_nat.h" #include "netinet/ip_frag.h" #include "netinet/ip_state.h" #include "netinet/ip_proxy.h" #include "netinet/ip_auth.h" #include "netinet/ip_sync.h" #include "netinet/ip_lookup.h" #include "netinet/ip_dstlist.h" #ifdef IPFILTER_SCAN # include "netinet/ip_scan.h" #endif #include "netinet/ip_pool.h" #include #include #ifdef CSUM_DATA_VALID # include #endif extern int ip_optcopy(struct ip *, struct ip *); #ifdef IPFILTER_M_IPFILTER MALLOC_DEFINE(M_IPFILTER, "ipfilter", "IP Filter packet filter data structures"); #endif static int ipf_send_ip(fr_info_t *, mb_t *); static void ipf_timer_func(void *arg); VNET_DEFINE(ipf_main_softc_t, ipfmain) = { .ipf_running = -2, }; #define V_ipfmain VNET(ipfmain) #include #include VNET_DEFINE_STATIC(eventhandler_tag, ipf_arrivetag); VNET_DEFINE_STATIC(eventhandler_tag, ipf_departtag); #define V_ipf_arrivetag VNET(ipf_arrivetag) #define V_ipf_departtag VNET(ipf_departtag) #if 0 /* * Disable the "cloner" event handler; we are getting interface * events before the firewall is fully initiallized and also no vnet * information thus leading to uninitialised memory accesses. * In addition it is unclear why we need it in first place. * If it turns out to be needed, well need a dedicated event handler * for it to deal with the ifc and the correct vnet. */ VNET_DEFINE_STATIC(eventhandler_tag, ipf_clonetag); #define V_ipf_clonetag VNET(ipf_clonetag) #endif static void ipf_ifevent(void *arg, struct ifnet *ifp); static void ipf_ifevent(arg, ifp) void *arg; struct ifnet *ifp; { CURVNET_SET(ifp->if_vnet); if (V_ipfmain.ipf_running > 0) ipf_sync(&V_ipfmain, NULL); CURVNET_RESTORE(); } static pfil_return_t ipf_check_wrapper(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct ip *ip = mtod(*mp, struct ip *); pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, ip, ip->ip_hl << 2, ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? PFIL_PASS : PFIL_DROPPED); } #ifdef USE_INET6 static pfil_return_t ipf_check_wrapper6(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, mtod(*mp, struct ip *), sizeof(struct ip6_hdr), ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? PFIL_PASS : PFIL_DROPPED); } # endif #if defined(IPFILTER_LKM) int ipf_identify(s) char *s; { if (strcmp(s, "ipl") == 0) return 1; return 0; } #endif /* IPFILTER_LKM */ static void ipf_timer_func(arg) void *arg; { ipf_main_softc_t *softc = arg; SPL_INT(s); SPL_NET(s); READ_ENTER(&softc->ipf_global); if (softc->ipf_running > 0) ipf_slowtimer(softc); if (softc->ipf_running == -1 || softc->ipf_running == 1) { #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, hz/2); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); } RWLOCK_EXIT(&softc->ipf_global); SPL_X(s); } int ipfattach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif SPL_NET(s); if (softc->ipf_running > 0) { SPL_X(s); return EBUSY; } if (ipf_init_all(softc) < 0) { SPL_X(s); return EIO; } bzero((char *)V_ipfmain.ipf_selwait, sizeof(V_ipfmain.ipf_selwait)); softc->ipf_running = 1; if (softc->ipf_control_forwarding & 1) V_ipforwarding = 1; SPL_X(s); #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); return 0; } /* * Disable the filter by removing the hooks from the IP input/output * stream. */ int ipfdetach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif if (softc->ipf_control_forwarding & 2) V_ipforwarding = 0; SPL_NET(s); #if 0 if (softc->ipf_slow_ch.callout != NULL) untimeout(ipf_timer_func, softc, softc->ipf_slow_ch); bzero(&softc->ipf_slow, sizeof(softc->ipf_slow)); #endif callout_drain(&softc->ipf_slow_ch); ipf_fini_all(softc); softc->ipf_running = -2; SPL_X(s); return 0; } /* * Filter ioctl interface. */ int ipfioctl(dev, cmd, data, mode, p) struct thread *p; #define p_cred td_ucred #define p_uid td_ucred->cr_ruid struct cdev *dev; ioctlcmd_t cmd; caddr_t data; int mode; { int error = 0, unit = 0; SPL_INT(s); CURVNET_SET(TD_TO_VNET(p)); if (securelevel_ge(p->p_cred, 3) && (mode & FWRITE)) { V_ipfmain.ipf_interror = 130001; CURVNET_RESTORE(); return EPERM; } unit = GET_MINOR(dev); if ((IPL_LOGMAX < unit) || (unit < 0)) { V_ipfmain.ipf_interror = 130002; CURVNET_RESTORE(); return ENXIO; } if (V_ipfmain.ipf_running <= 0) { if (unit != IPL_LOGIPF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130003; CURVNET_RESTORE(); return EIO; } if (cmd != SIOCIPFGETNEXT && cmd != SIOCIPFGET && cmd != SIOCIPFSET && cmd != SIOCFRENB && cmd != SIOCGETFS && cmd != SIOCGETFF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130004; CURVNET_RESTORE(); return EIO; } } SPL_NET(s); error = ipf_ioctlswitch(&V_ipfmain, unit, data, cmd, mode, p->p_uid, p); CURVNET_RESTORE(); if (error != -1) { SPL_X(s); return error; } SPL_X(s); return error; } /* * ipf_send_reset - this could conceivably be a call to tcp_respond(), but that * requires a large amount of setting up and isn't any more efficient. */ int ipf_send_reset(fin) fr_info_t *fin; { struct tcphdr *tcp, *tcp2; int tlen = 0, hlen; struct mbuf *m; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip; tcp = fin->fin_dp; if (tcp->th_flags & TH_RST) return -1; /* feedback loop */ if (ipf_checkl4sum(fin) == -1) return -1; tlen = fin->fin_dlen - (TCP_OFF(tcp) << 2) + ((tcp->th_flags & TH_SYN) ? 1 : 0) + ((tcp->th_flags & TH_FIN) ? 1 : 0); #ifdef USE_INET6 hlen = (fin->fin_v == 6) ? sizeof(ip6_t) : sizeof(ip_t); #else hlen = sizeof(ip_t); #endif #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; if (sizeof(*tcp2) + hlen > MLEN) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } } m->m_len = sizeof(*tcp2) + hlen; m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = (struct ifnet *)0; ip = mtod(m, struct ip *); bzero((char *)ip, hlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; #endif tcp2 = (struct tcphdr *)((char *)ip + hlen); tcp2->th_sport = tcp->th_dport; tcp2->th_dport = tcp->th_sport; if (tcp->th_flags & TH_ACK) { tcp2->th_seq = tcp->th_ack; tcp2->th_flags = TH_RST; tcp2->th_ack = 0; } else { tcp2->th_seq = 0; tcp2->th_ack = ntohl(tcp->th_seq); tcp2->th_ack += tlen; tcp2->th_ack = htonl(tcp2->th_ack); tcp2->th_flags = TH_RST|TH_ACK; } TCP_X2_A(tcp2, 0); TCP_OFF_A(tcp2, sizeof(*tcp2) >> 2); tcp2->th_win = tcp->th_win; tcp2->th_sum = 0; tcp2->th_urp = 0; #ifdef USE_INET6 if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = 0; ip6->ip6_src = fin->fin_dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; tcp2->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*ip6), sizeof(*tcp2)); return ipf_send_ip(fin, m); } #endif ip->ip_p = IPPROTO_TCP; ip->ip_len = htons(sizeof(struct tcphdr)); ip->ip_src.s_addr = fin->fin_daddr; ip->ip_dst.s_addr = fin->fin_saddr; tcp2->th_sum = in_cksum(m, hlen + sizeof(*tcp2)); ip->ip_len = htons(hlen + sizeof(*tcp2)); return ipf_send_ip(fin, m); } /* * ip_len must be in network byte order when called. */ static int ipf_send_ip(fin, m) fr_info_t *fin; mb_t *m; { fr_info_t fnew; ip_t *ip, *oip; int hlen; ip = mtod(m, ip_t *); bzero((char *)&fnew, sizeof(fnew)); fnew.fin_main_soft = fin->fin_main_soft; IP_V_A(ip, fin->fin_v); switch (fin->fin_v) { case 4 : oip = fin->fin_ip; hlen = sizeof(*oip); fnew.fin_v = 4; fnew.fin_p = ip->ip_p; fnew.fin_plen = ntohs(ip->ip_len); IP_HL_A(ip, sizeof(*oip) >> 2); ip->ip_tos = oip->ip_tos; ip->ip_id = fin->fin_ip->ip_id; ip->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); ip->ip_ttl = V_ip_defttl; ip->ip_sum = 0; break; #ifdef USE_INET6 case 6 : { ip6_t *ip6 = (ip6_t *)ip; ip6->ip6_vfc = 0x60; ip6->ip6_hlim = IPDEFTTL; hlen = sizeof(*ip6); fnew.fin_p = ip6->ip6_nxt; fnew.fin_v = 6; fnew.fin_plen = ntohs(ip6->ip6_plen) + hlen; break; } #endif default : return EINVAL; } #ifdef IPSEC_SUPPORT m->m_pkthdr.rcvif = NULL; #endif fnew.fin_ifp = fin->fin_ifp; fnew.fin_flx = FI_NOCKSUM; fnew.fin_m = m; fnew.fin_ip = ip; fnew.fin_mp = &m; fnew.fin_hlen = hlen; fnew.fin_dp = (char *)ip + hlen; (void) ipf_makefrip(hlen, ip, &fnew); return ipf_fastroute(m, &m, &fnew, NULL); } int ipf_send_icmp_err(type, fin, dst) int type; fr_info_t *fin; int dst; { int err, hlen, xtra, iclen, ohlen, avail, code; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip, *ip2; if ((type < 0) || (type >= ICMP_MAXTYPE)) return -1; code = fin->fin_icode; #ifdef USE_INET6 /* See NetBSD ip_fil_netbsd.c r1.4: */ if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return -1; #endif if (ipf_checkl4sum(fin) == -1) return -1; #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; avail = MHLEN; xtra = 0; hlen = 0; ohlen = 0; dst4.s_addr = 0; ifp = fin->fin_ifp; if (fin->fin_v == 4) { if ((fin->fin_p == IPPROTO_ICMP) && !(fin->fin_flx & FI_SHORT)) switch (ntohs(fin->fin_data[0]) >> 8) { case ICMP_ECHO : case ICMP_TSTAMP : case ICMP_IREQ : case ICMP_MASKREQ : break; default : FREE_MB_T(m); return 0; } if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 4, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } dst4 = dst6.in4; } else dst4.s_addr = fin->fin_daddr; hlen = sizeof(ip_t); ohlen = fin->fin_hlen; iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; if (fin->fin_hlen < fin->fin_plen) xtra = MIN(fin->fin_dlen, 8); else xtra = 0; } #ifdef USE_INET6 else if (fin->fin_v == 6) { hlen = sizeof(ip6_t); ohlen = sizeof(ip6_t); iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; type = icmptoicmp6types[type]; if (type == ICMP6_DST_UNREACH) code = icmptoicmp6unreach[code]; if (iclen + max_linkhdr + fin->fin_plen > avail) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } avail = MCLBYTES; } xtra = MIN(fin->fin_plen, avail - iclen - max_linkhdr); xtra = MIN(xtra, IPV6_MMTU - iclen); if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 6, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } } else dst6 = fin->fin_dst6; } #endif else { FREE_MB_T(m); return -1; } avail -= (max_linkhdr + iclen); if (avail < 0) { FREE_MB_T(m); return -1; } if (xtra > avail) xtra = avail; iclen += xtra; m->m_data += max_linkhdr; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = iclen; m->m_len = iclen; ip = mtod(m, ip_t *); icmp = (struct icmp *)((char *)ip + hlen); ip2 = (ip_t *)&icmp->icmp_ip; icmp->icmp_type = type; icmp->icmp_code = fin->fin_icode; icmp->icmp_cksum = 0; #ifdef icmp_nextmtu if (type == ICMP_UNREACH && fin->fin_icode == ICMP_UNREACH_NEEDFRAG) { if (fin->fin_mtu != 0) { icmp->icmp_nextmtu = htons(fin->fin_mtu); } else if (ifp != NULL) { icmp->icmp_nextmtu = htons(GETIFMTU_4(ifp)); } else { /* make up a number... */ icmp->icmp_nextmtu = htons(fin->fin_plen - 20); } } #endif bcopy((char *)fin->fin_ip, (char *)ip2, ohlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(iclen - hlen); ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 0; ip6->ip6_src = dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), iclen - hlen); } else #endif { ip->ip_p = IPPROTO_ICMP; ip->ip_src.s_addr = dst4.s_addr; ip->ip_dst.s_addr = fin->fin_saddr; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = ipf_cksum((u_short *)icmp, sizeof(*icmp) + 8); ip->ip_len = htons(iclen); ip->ip_p = IPPROTO_ICMP; } err = ipf_send_ip(fin, m); return err; } /* * m0 - pointer to mbuf where the IP packet starts * mpp - pointer to the mbuf pointer that is the start of the mbuf chain */ int ipf_fastroute(m0, mpp, fin, fdp) mb_t *m0, **mpp; fr_info_t *fin; frdest_t *fdp; { register struct ip *ip, *mhip; register struct mbuf *m = *mpp; int len, off, error = 0, hlen, code; struct ifnet *ifp, *sifp; - struct sockaddr_in dst; + struct route ro; + struct sockaddr_in *dst; + const struct sockaddr *gw; struct nhop_object *nh; u_long fibnum = 0; u_short ip_off; frdest_t node; frentry_t *fr; #ifdef M_WRITABLE /* * HOT FIX/KLUDGE: * * If the mbuf we're about to send is not writable (because of * a cluster reference, for example) we'll need to make a copy * of it since this routine modifies the contents. * * If you have non-crappy network hardware that can transmit data * from the mbuf, rather than making a copy, this is gonna be a * problem. */ if (M_WRITABLE(m) == 0) { m0 = m_dup(m, M_NOWAIT); if (m0 != NULL) { FREE_MB_T(m); m = m0; *mpp = m; } else { error = ENOBUFS; FREE_MB_T(m); goto done; } } #endif #ifdef USE_INET6 if (fin->fin_v == 6) { /* * currently "to " and "to :ip#" are not supported * for IPv6 */ return ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif hlen = fin->fin_hlen; ip = mtod(m0, struct ip *); ifp = NULL; /* * Route packet. */ - bzero(&dst, sizeof (dst)); - dst.sin_family = AF_INET; - dst.sin_addr = ip->ip_dst; - dst.sin_len = sizeof(dst); + bzero(&ro, sizeof (ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_addr = ip->ip_dst; + dst->sin_len = sizeof(dst); + gw = (const struct sockaddr *)dst; fr = fin->fin_fr; if ((fr != NULL) && !(fr->fr_flags & FR_KEEPSTATE) && (fdp != NULL) && (fdp->fd_type == FRD_DSTLIST)) { if (ipf_dstlist_select_node(fin, fdp->fd_ptr, NULL, &node) == 0) fdp = &node; } if (fdp != NULL) ifp = fdp->fd_ptr; else ifp = fin->fin_ifp; if ((ifp == NULL) && ((fr == NULL) || !(fr->fr_flags & FR_FASTROUTE))) { error = -2; goto bad; } if ((fdp != NULL) && (fdp->fd_ip.s_addr != 0)) - dst.sin_addr = fdp->fd_ip; + dst->sin_addr = fdp->fd_ip; fibnum = M_GETFIB(m0); NET_EPOCH_ASSERT(); - nh = fib4_lookup(fibnum, dst.sin_addr, 0, NHR_NONE, 0); + nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_NONE, 0); if (nh == NULL) { if (in_localaddr(ip->ip_dst)) error = EHOSTUNREACH; else error = ENETUNREACH; goto bad; } if (ifp == NULL) ifp = nh->nh_ifp; - if (nh->nh_flags & NHF_GATEWAY) - dst.sin_addr = nh->gw4_sa.sin_addr; + if (nh->nh_flags & NHF_GATEWAY) { + gw = &nh->gw_sa; + ro.ro_flags |= RT_HAS_GW; + } /* * For input packets which are being "fastrouted", they won't * go back through output filtering and miss their chance to get * NAT'd and counted. Duplicated packets aren't considered to be * part of the normal packet stream, so do not NAT them or pass * them through stateful checking, etc. */ if ((fdp != &fr->fr_dif) && (fin->fin_out == 0)) { sifp = fin->fin_ifp; fin->fin_ifp = ifp; fin->fin_out = 1; (void) ipf_acctpkt(fin, NULL); fin->fin_fr = NULL; if (!fr || !(fr->fr_flags & FR_RETMASK)) { u_32_t pass; (void) ipf_state_check(fin, &pass); } switch (ipf_nat_checkout(fin, NULL)) { case 0 : break; case 1 : ip->ip_sum = 0; break; case -1 : error = -1; goto bad; break; } fin->fin_ifp = sifp; fin->fin_out = 0; } else ip->ip_sum = 0; /* * If small enough for interface, can just send directly. */ if (ntohs(ip->ip_len) <= ifp->if_mtu) { if (!ip->ip_sum) ip->ip_sum = in_cksum(m, hlen); - error = (*ifp->if_output)(ifp, m, (struct sockaddr *)&dst, - NULL - ); + error = (*ifp->if_output)(ifp, m, gw, &ro); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { error = EMSGSIZE; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } { int mhlen, firstlen = len; struct mbuf **mnext = &m->m_act; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) { m = m0; error = ENOBUFS; goto bad; } m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); bcopy((char *)ip, (char *)mhip, sizeof(*ip)); if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); IP_HL_A(mhip, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ntohs(ip->ip_len)) len = ntohs(ip->ip_len) - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); *mnext = m; m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == 0) { error = ENOBUFS; /* ??? */ goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; mhip->ip_off = htons((u_short)mhip->ip_off); mhip->ip_sum = 0; mhip->ip_sum = in_cksum(m, mhlen); mnext = &m->m_act; } /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). */ m_adj(m0, hlen + firstlen - ip->ip_len); ip->ip_len = htons((u_short)(hlen + firstlen)); ip->ip_off = htons((u_short)IP_MF); ip->ip_sum = 0; ip->ip_sum = in_cksum(m0, hlen); sendorfree: for (m = m0; m; m = m0) { m0 = m->m_act; m->m_act = 0; if (error == 0) - error = (*ifp->if_output)(ifp, m, - (struct sockaddr *)&dst, - NULL - ); + error = (*ifp->if_output)(ifp, m, gw, &ro); else FREE_MB_T(m); } } done: if (!error) V_ipfmain.ipf_frouteok[0]++; else V_ipfmain.ipf_frouteok[1]++; return 0; bad: if (error == EMSGSIZE) { sifp = fin->fin_ifp; code = fin->fin_icode; fin->fin_icode = ICMP_UNREACH_NEEDFRAG; fin->fin_ifp = ifp; (void) ipf_send_icmp_err(ICMP_UNREACH, fin, 1); fin->fin_ifp = sifp; fin->fin_icode = code; } FREE_MB_T(m); goto done; } int ipf_verifysrc(fin) fr_info_t *fin; { struct nhop_object *nh; NET_EPOCH_ASSERT(); nh = fib4_lookup(RT_DEFAULT_FIB, fin->fin_src, 0, NHR_NONE, 0); if (nh == NULL) return (0); return (fin->fin_ifp == nh->nh_ifp); } /* * return the first IP Address associated with an interface */ int ipf_ifpaddr(softc, v, atype, ifptr, inp, inpmask) ipf_main_softc_t *softc; int v, atype; void *ifptr; i6addr_t *inp, *inpmask; { #ifdef USE_INET6 struct in6_addr *ia6 = NULL; #endif struct sockaddr *sock, *mask; struct sockaddr_in *sin; struct ifaddr *ifa; struct ifnet *ifp; if ((ifptr == NULL) || (ifptr == (void *)-1)) return -1; sin = NULL; ifp = ifptr; if (v == 4) inp->in4.s_addr = 0; #ifdef USE_INET6 else if (v == 6) bzero((char *)inp, sizeof(*inp)); #endif ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); sock = ifa->ifa_addr; while (sock != NULL && ifa != NULL) { sin = (struct sockaddr_in *)sock; if ((v == 4) && (sin->sin_family == AF_INET)) break; #ifdef USE_INET6 if ((v == 6) && (sin->sin_family == AF_INET6)) { ia6 = &((struct sockaddr_in6 *)sin)->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(ia6) && !IN6_IS_ADDR_LOOPBACK(ia6)) break; } #endif ifa = CK_STAILQ_NEXT(ifa, ifa_link); if (ifa != NULL) sock = ifa->ifa_addr; } if (ifa == NULL || sin == NULL) return -1; mask = ifa->ifa_netmask; if (atype == FRI_BROADCAST) sock = ifa->ifa_broadaddr; else if (atype == FRI_PEERADDR) sock = ifa->ifa_dstaddr; if (sock == NULL) return -1; #ifdef USE_INET6 if (v == 6) { return ipf_ifpfillv6addr(atype, (struct sockaddr_in6 *)sock, (struct sockaddr_in6 *)mask, inp, inpmask); } #endif return ipf_ifpfillv4addr(atype, (struct sockaddr_in *)sock, (struct sockaddr_in *)mask, &inp->in4, &inpmask->in4); } u_32_t ipf_newisn(fin) fr_info_t *fin; { u_32_t newiss; newiss = arc4random(); return newiss; } INLINE int ipf_checkv4sum(fin) fr_info_t *fin; { #ifdef CSUM_DATA_VALID int manual = 0; u_short sum; ip_t *ip; mb_t *m; if ((fin->fin_flx & FI_NOCKSUM) != 0) return 0; if ((fin->fin_flx & FI_SHORT) != 0) return 1; if (fin->fin_cksum != FI_CK_NEEDED) return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; m = fin->fin_m; if (m == NULL) { manual = 1; goto skipauto; } ip = fin->fin_ip; if ((m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)) == CSUM_IP_CHECKED) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_csum_ip_checked, fr_info_t *, fin, u_int, m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)); return -1; } if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { /* Depending on the driver, UDP may have zero checksum */ if (fin->fin_p == IPPROTO_UDP && (fin->fin_flx & (FI_FRAG|FI_SHORT|FI_BAD)) == 0) { udphdr_t *udp = fin->fin_dp; if (udp->uh_sum == 0) { /* * we're good no matter what the hardware * checksum flags and csum_data say (handling * of csum_data for zero UDP checksum is not * consistent across all drivers) */ fin->fin_cksum = 1; return 0; } } if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) sum = m->m_pkthdr.csum_data; else sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + fin->fin_dlen + fin->fin_p)); sum ^= 0xffff; if (sum != 0) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_sum, fr_info_t *, fin, u_int, sum); } else { fin->fin_cksum = FI_CK_SUMOK; return 0; } } else { if (m->m_pkthdr.csum_flags == CSUM_DELAY_DATA) { fin->fin_cksum = FI_CK_L4FULL; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_TCP || m->m_pkthdr.csum_flags == CSUM_UDP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_IP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else { manual = 1; } } skipauto: if (manual != 0) { if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_manual, fr_info_t *, fin, u_int, manual); return -1; } } #else if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } #endif return 0; } #ifdef USE_INET6 INLINE int ipf_checkv6sum(fin) fr_info_t *fin; { if ((fin->fin_flx & FI_NOCKSUM) != 0) { DT(ipf_checkv6sum_fi_nocksum); return 0; } if ((fin->fin_flx & FI_SHORT) != 0) { DT(ipf_checkv6sum_fi_short); return 1; } if (fin->fin_cksum != FI_CK_NEEDED) { DT(ipf_checkv6sum_fi_ck_needed); return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; } if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv6sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } return 0; } #endif /* USE_INET6 */ size_t mbufchainlen(m0) struct mbuf *m0; { size_t len; if ((m0->m_flags & M_PKTHDR) != 0) { len = m0->m_pkthdr.len; } else { struct mbuf *m; for (m = m0, len = 0; m != NULL; m = m->m_next) len += m->m_len; } return len; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pullup */ /* Returns: NULL == pullup failed, else pointer to protocol header */ /* Parameters: xmin(I)- pointer to buffer where data packet starts */ /* fin(I) - pointer to packet information */ /* len(I) - number of bytes to pullup */ /* */ /* Attempt to move at least len bytes (from the start of the buffer) into a */ /* single buffer for ease of access. Operating system native functions are */ /* used to manage buffers - if necessary. If the entire packet ends up in */ /* a single buffer, set the FI_COALESCE flag even though ipf_coalesce() has */ /* not been called. Both fin_ip and fin_dp are updated before exiting _IF_ */ /* and ONLY if the pullup succeeds. */ /* */ /* We assume that 'xmin' is a pointer to a buffer that is part of the chain */ /* of buffers that starts at *fin->fin_mp. */ /* ------------------------------------------------------------------------ */ void * ipf_pullup(xmin, fin, len) mb_t *xmin; fr_info_t *fin; int len; { int dpoff, ipoff; mb_t *m = xmin; char *ip; if (m == NULL) return NULL; ip = (char *)fin->fin_ip; if ((fin->fin_flx & FI_COALESCE) != 0) return ip; ipoff = fin->fin_ipoff; if (fin->fin_dp != NULL) dpoff = (char *)fin->fin_dp - (char *)ip; else dpoff = 0; if (M_LEN(m) < len) { mb_t *n = *fin->fin_mp; /* * Assume that M_PKTHDR is set and just work with what is left * rather than check.. * Should not make any real difference, anyway. */ if (m != n) { /* * Record the mbuf that points to the mbuf that we're * about to go to work on so that we can update the * m_next appropriately later. */ for (; n->m_next != m; n = n->m_next) ; } else { n = NULL; } #ifdef MHLEN if (len > MHLEN) #else if (len > MLEN) #endif { #ifdef HAVE_M_PULLDOWN if (m_pulldown(m, 0, len, NULL) == NULL) m = NULL; #else FREE_MB_T(*fin->fin_mp); m = NULL; n = NULL; #endif } else { m = m_pullup(m, len); } if (n != NULL) n->m_next = m; if (m == NULL) { /* * When n is non-NULL, it indicates that m pointed to * a sub-chain (tail) of the mbuf and that the head * of this chain has not yet been free'd. */ if (n != NULL) { FREE_MB_T(*fin->fin_mp); } *fin->fin_mp = NULL; fin->fin_m = NULL; return NULL; } if (n == NULL) *fin->fin_mp = m; while (M_LEN(m) == 0) { m = m->m_next; } fin->fin_m = m; ip = MTOD(m, char *) + ipoff; fin->fin_ip = (ip_t *)ip; if (fin->fin_dp != NULL) fin->fin_dp = (char *)fin->fin_ip + dpoff; if (fin->fin_fraghdr != NULL) fin->fin_fraghdr = (char *)ip + ((char *)fin->fin_fraghdr - (char *)fin->fin_ip); } if (len == fin->fin_plen) fin->fin_flx |= FI_COALESCE; return ip; } int ipf_inject(fin, m) fr_info_t *fin; mb_t *m; { struct epoch_tracker et; int error = 0; NET_EPOCH_ENTER(et); if (fin->fin_out == 0) { netisr_dispatch(NETISR_IP, m); } else { fin->fin_ip->ip_len = ntohs(fin->fin_ip->ip_len); fin->fin_ip->ip_off = ntohs(fin->fin_ip->ip_off); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); } NET_EPOCH_EXIT(et); return error; } VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet_hook); VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet6_hook); #define V_ipf_inet_hook VNET(ipf_inet_hook) #define V_ipf_inet6_hook VNET(ipf_inet6_hook) int ipf_pfil_unhook(void) { pfil_remove_hook(V_ipf_inet_hook); #ifdef USE_INET6 pfil_remove_hook(V_ipf_inet6_hook); #endif return (0); } int ipf_pfil_hook(void) { struct pfil_hook_args pha; struct pfil_link_args pla; int error, error6; pha.pa_version = PFIL_VERSION; pha.pa_flags = PFIL_IN | PFIL_OUT; pha.pa_modname = "ipfilter"; pha.pa_rulname = "default-ip4"; pha.pa_func = ipf_check_wrapper; pha.pa_ruleset = NULL; pha.pa_type = PFIL_TYPE_IP4; V_ipf_inet_hook = pfil_add_hook(&pha); #ifdef USE_INET6 pha.pa_rulname = "default-ip6"; pha.pa_func = ipf_check_wrapper6; pha.pa_type = PFIL_TYPE_IP6; V_ipf_inet6_hook = pfil_add_hook(&pha); #endif pla.pa_version = PFIL_VERSION; pla.pa_flags = PFIL_IN | PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR; pla.pa_head = V_inet_pfil_head; pla.pa_hook = V_ipf_inet_hook; error = pfil_link(&pla); error6 = 0; #ifdef USE_INET6 pla.pa_head = V_inet6_pfil_head; pla.pa_hook = V_ipf_inet6_hook; error6 = pfil_link(&pla); #endif if (error || error6) error = ENODEV; else error = 0; return (error); } void ipf_event_reg(void) { V_ipf_arrivetag = EVENTHANDLER_REGISTER(ifnet_arrival_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); V_ipf_departtag = EVENTHANDLER_REGISTER(ifnet_departure_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); #if 0 V_ipf_clonetag = EVENTHANDLER_REGISTER(if_clone_event, ipf_ifevent, \ NULL, EVENTHANDLER_PRI_ANY); #endif } void ipf_event_dereg(void) { if (V_ipf_arrivetag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, V_ipf_arrivetag); } if (V_ipf_departtag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, V_ipf_departtag); } #if 0 if (V_ipf_clonetag != NULL) { EVENTHANDLER_DEREGISTER(if_clone_event, V_ipf_clonetag); } #endif } u_32_t ipf_random() { return arc4random(); } u_int ipf_pcksum(fin, hlen, sum) fr_info_t *fin; int hlen; u_int sum; { struct mbuf *m; u_int sum2; int off; m = fin->fin_m; off = (char *)fin->fin_dp - (char *)fin->fin_ip; m->m_data += hlen; m->m_len -= hlen; sum2 = in_cksum(fin->fin_m, fin->fin_plen - off); m->m_len += hlen; m->m_data -= hlen; /* * Both sum and sum2 are partial sums, so combine them together. */ sum += ~sum2 & 0xffff; while (sum > 0xffff) sum = (sum & 0xffff) + (sum >> 16); sum2 = ~sum & 0xffff; return sum2; } #ifdef USE_INET6 u_int ipf_pcksum6(m, ip6, off, len) struct mbuf *m; ip6_t *ip6; u_int32_t off; u_int32_t len; { #ifdef _KERNEL int sum; if (m->m_len < sizeof(struct ip6_hdr)) { return 0xffff; } sum = in6_cksum(m, ip6->ip6_nxt, off, len); return(sum); #else u_short *sp; u_int sum; sp = (u_short *)&ip6->ip6_src; sum = *sp++; /* ip6_src */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; /* ip6_dst */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; return(ipf_pcksum(fin, off, sum)); #endif } #endif void ipf_fbsd_kenv_get(ipf_main_softc_t *softc) { TUNABLE_INT_FETCH("net.inet.ipf.large_nat", &softc->ipf_large_nat); } diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 8623079fe429..4d98597409d6 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1,1601 +1,1604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); if (n > t->nstids - t->stids_in_use) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." " At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this directly, use release_lctx instead */ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, ("%s: refcount %d", __func__, lctx->refcount)); KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); if (lctx->ce) t4_release_clip_entry(sc, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); return (in_pcbrele_wlocked(inp)); } static void hold_lctx(struct listen_ctx *lctx) { refcount_acquire(&lctx->refcount); } static inline uint32_t listen_hashfn(void *key, u_long mask) { return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); } /* * Add a listen_ctx entry to the listen hash table. */ static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(lctx->inp, td->listen_mask); mtx_lock(&td->lctx_hash_lock); LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); td->lctx_count++; mtx_unlock(&td->lctx_hash_lock); } /* * Look for the listening socket's context entry in the hash and return it. */ static struct listen_ctx * listen_hash_find(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { if (lctx->inp == inp) break; } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Removes the listen_ctx structure for inp from the hash and returns it. */ static struct listen_ctx * listen_hash_del(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx, *l; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { if (lctx->inp == inp) { LIST_REMOVE(lctx, link); td->lctx_count--; break; } } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Releases a hold on the lctx. Must be called with the listening socket's inp * locked. The inp may be freed by this function and it returns NULL to * indicate this. */ static struct inpcb * release_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; int inp_freed = 0; INP_WLOCK_ASSERT(inp); if (refcount_release(&lctx->refcount)) inp_freed = free_lctx(sc, lctx); return (inp_freed ? NULL : inp); } static void send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) { struct mbuf *m = synqe->syn; struct ifnet *ifp = m->m_pkthdr.rcvif; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct wrqe *wr; struct fw_flowc_wr *flowc; struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; const int nparams = 6; const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); const u_int pfvf = sc->pf << S_FW_VIID_PFN; INP_WLOCK_ASSERT(synqe->lctx->inp); MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(synqe->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; flowc->mnemval[4].val = htobe32(512); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[5].val = htobe32(512); synqe->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } static void send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, int rst_status) { struct adapter *sc = tod->tod_softc; struct wrqe *wr; struct cpl_abort_req *req; INP_WLOCK_ASSERT(synqe->lctx->inp); CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? " (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ synqe->flags |= TPF_ABORT_SHUTDOWN; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); wr = alloc_wrqe(sizeof(*req), &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); req->rsvd0 = 0; /* don't have a snd_nxt */ req->rsvd1 = 1; /* no data sent yet */ req->cmd = rst_status; t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); } static int create_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip = inp->inp_laddr.s_addr; req->peer_ip = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int create_server6(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req6 *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; req->peer_ip_hi = 0; req->peer_ip_lo = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int destroy_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_close_listsvr_req *req; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tp->t_inpcb; struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); if (sc->flags & KERN_TLS_ON) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). */ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && vi->ifp->if_capenable & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { return (EINPROGRESS); } destroy_server(sc, lctx); return (0); } static inline struct synq_entry * alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) { struct synq_entry *synqe; INP_RLOCK_ASSERT(lctx->inp); MPASS(flags == M_WAITOK || flags == M_NOWAIT); synqe = malloc(sizeof(*synqe), M_CXGBE, flags); if (__predict_true(synqe != NULL)) { synqe->flags = TPF_SYNQE; refcount_init(&synqe->refcnt, 1); synqe->lctx = lctx; hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ synqe->syn = NULL; } return (synqe); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline struct inpcb * release_synqe(struct adapter *sc, struct synq_entry *synqe) { struct inpcb *inp; MPASS(synqe->flags & TPF_SYNQE); MPASS(synqe->lctx != NULL); inp = synqe->lctx->inp; MPASS(inp != NULL); INP_WLOCK_ASSERT(inp); if (refcount_release(&synqe->refcnt)) { inp = release_lctx(sc, synqe->lctx); m_freem(synqe->syn); free(synqe, M_CXGBE); } return (inp); } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod, void *arg) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = synqe->lctx->inp; /* * XXX: this is a LOR but harmless when running from the softclock. */ INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct synq_entry *synqe = arg; if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->irs = be32toh(th->th_ack) - 1; synqe->ts = to.to_tsval; } m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; if (status != CPL_ERR_NONE) log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. */ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; int ntids; INP_WLOCK_ASSERT(inp); ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; remove_tid(sc, synqe->tid, ntids); release_tid(sc, synqe->tid, lctx->ctrlq); t4_l2t_release(e); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } void synack_failure_cleanup(struct adapter *sc, int tid) { struct synq_entry *synqe = lookup_tid(sc, tid); INP_WLOCK(synqe->lctx->inp); done_with_synqe(sc, synqe); } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct sge_ofld_txq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. */ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = sotoinpcb(so); struct toepcb *toep = synqe->toep; NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); MPASS(toep->tid == synqe->tid); offload_socket(so, toep); make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; inp->inp_flowid = synqe->rss_hash; } static void t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) { bzero(to, sizeof(*to)); if (t4opt->mss) { to->to_flags |= TOF_MSS; to->to_mss = be16toh(t4opt->mss); } if (t4opt->wsf > 0 && t4opt->wsf < 15) { to->to_flags |= TOF_SCALE; to->to_wscale = t4opt->wsf; } if (t4opt->tstamp) to->to_flags |= TOF_TS; if (t4opt->sack) to->to_flags |= TOF_SACKPERM; } static bool encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) { u_int hlen = be32toh(cpl->hdr_len); if (chip_id(sc) >= CHELSIO_T6) return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); else return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } /* extract TOS (DiffServ + ECN) byte for AccECN */ if (iptos) { if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; *iptos = ip->ip_tos; } #ifdef INET6 else if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { const struct ip6_hdr *ip6 = (const void *)l3hdr; *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif /* INET */ } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; struct nhop_object *nh; if (inc->inc_flags & INC_ISIPV6) { bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; else ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; } else { dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) - ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; + if (nh->gw_sa.sa_family == AF_INET) + ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; + else + *((struct sockaddr_in6 *)dst) = nh->gw6_sa; else ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; } e = t4_l2t_get(pi, ifp, dst); return (e); } static int send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, uint32_t opt2, int tid) { struct wrqe *wr; struct cpl_pass_accept_rpl *rpl; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); if (wr == NULL) return (ENOMEM); rpl = wrtod(wr); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); rpl5->iss = htobe32(synqe->iss); } rpl->opt0 = opt0; rpl->opt2 = opt2; return (t4_l2t_send(sc, wr, e)); } #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ if (!tunnel) { \ m_freem(m); \ m = NULL; \ } \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; struct ifnet *hw_ifp, *ifp; struct l2t_entry *e = NULL; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; uint8_t iptos; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); /* * Figure out the port the SYN arrived on. We'll look for an exact VI * match in a bit but in case we don't find any we'll use the main VI as * the incoming ifnet. */ l2info = be16toh(cpl->l2info); pi = sc->port[G_SYN_INTF(l2info)]; hw_ifp = pi->vi[0].ifp; m->m_pkthdr.rcvif = hw_ifp; CURVNET_SET(lctx->vnet); /* before any potential REJECT */ /* * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will * also hit the listener. We don't want to offload those. */ if (encapsulated_syn(sc, cpl)) { REJECT_PASS_ACCEPT_REQ(true); } /* * Use the MAC index to lookup the associated VI. If this SYN didn't * match a perfect MAC filter, punt. */ if (!(l2info & F_SYN_XACT_MATCH)) { REJECT_PASS_ACCEPT_REQ(true); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) goto found; } REJECT_PASS_ACCEPT_REQ(true); found: hw_ifp = vi->ifp; /* the cxgbe ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. Send the SYN to the kernel instead. */ if (__predict_false(cpl->tcpopt.unknown)) REJECT_PASS_ACCEPT_REQ(true); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. */ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff && vid != 0) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT_REQ(true); } else ifp = hw_ifp; /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. */ if (lctx->vnet != ifp->if_vnet) REJECT_PASS_ACCEPT_REQ(true); pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ NET_EPOCH_ENTER(et); if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ NET_EPOCH_ENTER(et); if (!in_ifhasaddr(ifp, inc.inc_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 1; } e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } inp = lctx->inp; /* listening socket, not owned by TOE */ INP_RLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ } synqe = alloc_synqe(sc, lctx, M_NOWAIT); if (synqe == NULL) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } MPASS(rss->hash_type == RSS_HASH_TCP); synqe->rss_hash = be32toh(rss->hash_val); atomic_store_int(&synqe->ok_to_respond, 0); init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, &synqe->params); /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. */ t4opt_to_tcpopt(&cpl->tcpopt, &to); toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; uint32_t opt2; opt0 = calc_options0(vi, &synqe->params); opt2 = calc_options2(vi, &synqe->params); insert_tid(sc, tid, synqe, ntids); synqe->tid = tid; synqe->syn = m; m = NULL; if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { remove_tid(sc, tid, ntids); m = synqe->syn; synqe->syn = NULL; NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); } else { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (synqe) { inp = synqe->lctx->inp; INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } if (m) { /* * The connection request hit a TOE listener but is being passed * on to the kernel sw stack instead of getting offloaded. */ m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; hw_ifp->if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); uint8_t iptos; /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; struct ifnet *ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; struct epoch_tracker et; int rstreason; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); ifp = synqe->syn->m_pkthdr.rcvif; vi = ifp->if_softc; KASSERT(vi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); if (__predict_false(inp->inp_flags & INP_DROPPED)) { reset: send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, synqe->params.rxq_idx, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) goto reset; toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; toep->vnet = lctx->vnet; bcopy(&synqe->params, &toep->params, sizeof(toep->params)); init_toepcb(vi, toep); MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); synqe->tcp_opt = cpl->tcp_opt; synqe->toep = toep; /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); if (inc.inc_flags & INC_ISIPV6) { if (lctx->ce == NULL) { toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); if (toep->ce == NULL) { free_toepcb(toep); goto reset; /* RST without a CLIP entry? */ } } else { t4_hold_clip_entry(sc, lctx->ce); toep->ce = lctx->ce; } } so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); rstreason = toe_syncache_expand(&inc, &to, &th, &so); if (rstreason < 0) { free_toepcb(toep); send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } else if (rstreason == 0 || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); MPASS(so->so_vnet == lctx->vnet); /* * This is for expansion from syncookies. * * XXX: we've held the tcbinfo lock throughout so there's no risk of * anyone accept'ing a connection before we've installed our hooks, but * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); } INP_WUNLOCK(new_inp); /* Done with the synqe */ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void t4_init_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); } void t4_uninit_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); } #endif diff --git a/sys/dev/iicbus/if_ic.c b/sys/dev/iicbus/if_ic.c index 4dac86141230..603265a52b13 100644 --- a/sys/dev/iicbus/if_ic.c +++ b/sys/dev/iicbus/if_ic.c @@ -1,440 +1,440 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998, 2001 Nicolas Souchu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * I2C bus IP driver */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" #define PCF_MASTER_ADDRESS 0xaa #define ICHDRLEN sizeof(u_int32_t) #define ICMTU 1500 /* default mtu */ struct ic_softc { struct ifnet *ic_ifp; device_t ic_dev; u_char ic_addr; /* peer I2C address */ int ic_flags; char *ic_obuf; char *ic_ifbuf; char *ic_cp; int ic_xfercnt; int ic_iferrs; struct mtx ic_lock; }; #define IC_SENDING 0x0001 #define IC_OBUF_BUSY 0x0002 #define IC_IFBUF_BUSY 0x0004 #define IC_BUFFERS_BUSY (IC_OBUF_BUSY | IC_IFBUF_BUSY) #define IC_BUFFER_WAITER 0x0004 static devclass_t ic_devclass; static int icprobe(device_t); static int icattach(device_t); static int icioctl(struct ifnet *, u_long, caddr_t); static int icoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int icintr(device_t, int, char *); static device_method_t ic_methods[] = { /* device interface */ DEVMETHOD(device_probe, icprobe), DEVMETHOD(device_attach, icattach), /* iicbus interface */ DEVMETHOD(iicbus_intr, icintr), { 0, 0 } }; static driver_t ic_driver = { "ic", ic_methods, sizeof(struct ic_softc), }; static void ic_alloc_buffers(struct ic_softc *sc, int mtu) { char *obuf, *ifbuf; obuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); ifbuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); mtx_lock(&sc->ic_lock); while (sc->ic_flags & IC_BUFFERS_BUSY) { sc->ic_flags |= IC_BUFFER_WAITER; mtx_sleep(sc, &sc->ic_lock, 0, "icalloc", 0); sc->ic_flags &= ~IC_BUFFER_WAITER; } free(sc->ic_obuf, M_DEVBUF); free(sc->ic_ifbuf, M_DEVBUF); sc->ic_obuf = obuf; sc->ic_ifbuf = ifbuf; sc->ic_ifp->if_mtu = mtu; mtx_unlock(&sc->ic_lock); } /* * icprobe() */ static int icprobe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } /* * icattach() */ static int icattach(device_t dev) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); struct ifnet *ifp; ifp = sc->ic_ifp = if_alloc(IFT_PARA); if (ifp == NULL) return (ENOSPC); mtx_init(&sc->ic_lock, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); sc->ic_addr = PCF_MASTER_ADDRESS; /* XXX only PCF masters */ sc->ic_dev = dev; ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_SIMPLEX | IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_ioctl = icioctl; ifp->if_output = icoutput; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_snd.ifq_maxlen = ifqmaxlen; ic_alloc_buffers(sc, ICMTU); if_attach(ifp); bpfattach(ifp, DLT_NULL, ICHDRLEN); return (0); } /* * iciotcl() */ static int icioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ic_softc *sc = ifp->if_softc; device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; int error; switch (cmd) { case SIOCAIFADDR: case SIOCSIFADDR: if (ifa->ifa_addr->sa_family != AF_INET) return (EAFNOSUPPORT); mtx_lock(&sc->ic_lock); ifp->if_flags |= IFF_UP; goto locked; case SIOCSIFFLAGS: mtx_lock(&sc->ic_lock); locked: if ((!(ifp->if_flags & IFF_UP)) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* XXX disable PCF */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; mtx_unlock(&sc->ic_lock); /* IFF_UP is not set, try to release the bus anyway */ iicbus_release_bus(parent, icdev); break; } if (((ifp->if_flags & IFF_UP)) && (!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { mtx_unlock(&sc->ic_lock); if ((error = iicbus_request_bus(parent, icdev, IIC_WAIT | IIC_INTR))) return (error); mtx_lock(&sc->ic_lock); iicbus_reset(parent, IIC_FASTEST, 0, NULL); ifp->if_drv_flags |= IFF_DRV_RUNNING; } mtx_unlock(&sc->ic_lock); break; case SIOCSIFMTU: ic_alloc_buffers(sc, ifr->ifr_mtu); break; case SIOCGIFMTU: mtx_lock(&sc->ic_lock); ifr->ifr_mtu = sc->ic_ifp->if_mtu; mtx_unlock(&sc->ic_lock); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) return (EAFNOSUPPORT); /* XXX */ switch (ifr->ifr_addr.sa_family) { case AF_INET: break; default: return (EAFNOSUPPORT); } break; default: return (EINVAL); } return (0); } /* * icintr() */ static int icintr(device_t dev, int event, char *ptr) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); struct mbuf *top; int len; mtx_lock(&sc->ic_lock); switch (event) { case INTR_GENERAL: case INTR_START: sc->ic_cp = sc->ic_ifbuf; sc->ic_xfercnt = 0; sc->ic_flags |= IC_IFBUF_BUSY; break; case INTR_STOP: /* if any error occurred during transfert, * drop the packet */ sc->ic_flags &= ~IC_IFBUF_BUSY; if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); if (sc->ic_iferrs) goto err; if ((len = sc->ic_xfercnt) == 0) break; /* ignore */ if (len <= ICHDRLEN) goto err; len -= ICHDRLEN; if_inc_counter(sc->ic_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->ic_ifp, IFCOUNTER_IBYTES, len); BPF_TAP(sc->ic_ifp, sc->ic_ifbuf, len + ICHDRLEN); top = m_devget(sc->ic_ifbuf + ICHDRLEN, len, 0, sc->ic_ifp, 0); if (top) { struct epoch_tracker et; mtx_unlock(&sc->ic_lock); M_SETFIB(top, sc->ic_ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(NETISR_IP, top); NET_EPOCH_EXIT(et); mtx_lock(&sc->ic_lock); } break; err: if_printf(sc->ic_ifp, "errors (%d)!\n", sc->ic_iferrs); sc->ic_iferrs = 0; /* reset error count */ if_inc_counter(sc->ic_ifp, IFCOUNTER_IERRORS, 1); break; case INTR_RECEIVE: if (sc->ic_xfercnt >= sc->ic_ifp->if_mtu + ICHDRLEN) { sc->ic_iferrs++; } else { *sc->ic_cp++ = *ptr; sc->ic_xfercnt++; } break; case INTR_NOACK: /* xfer terminated by master */ break; case INTR_TRANSMIT: *ptr = 0xff; /* XXX */ break; case INTR_ERROR: sc->ic_iferrs++; break; default: panic("%s: unknown event (%d)!", __func__, event); } mtx_unlock(&sc->ic_lock); return (0); } /* * icoutput() */ static int icoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ic_softc *sc = ifp->if_softc; device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); int len, sent; struct mbuf *mm; u_char *cp; u_int32_t hdr; /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &hdr, sizeof(hdr)); else - hdr = dst->sa_family; + hdr = RO_GET_FAMILY(ro, dst); mtx_lock(&sc->ic_lock); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* already sending? */ if (sc->ic_flags & IC_SENDING) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } /* insert header */ bcopy ((char *)&hdr, sc->ic_obuf, ICHDRLEN); cp = sc->ic_obuf + ICHDRLEN; len = 0; mm = m; do { if (len + mm->m_len > sc->ic_ifp->if_mtu) { /* packet too large */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } bcopy(mtod(mm,char *), cp, mm->m_len); cp += mm->m_len; len += mm->m_len; } while ((mm = mm->m_next)); BPF_MTAP2(ifp, &hdr, sizeof(hdr), m); sc->ic_flags |= (IC_SENDING | IC_OBUF_BUSY); m_freem(m); mtx_unlock(&sc->ic_lock); /* send the packet */ if (iicbus_block_write(parent, sc->ic_addr, sc->ic_obuf, len + ICHDRLEN, &sent)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } mtx_lock(&sc->ic_lock); sc->ic_flags &= ~(IC_SENDING | IC_OBUF_BUSY); if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); mtx_unlock(&sc->ic_lock); return (0); error: m_freem(m); mtx_unlock(&sc->ic_lock); return(0); } DRIVER_MODULE(ic, iicbus, ic_driver, ic_devclass, 0, 0); MODULE_DEPEND(ic, iicbus, IICBUS_MINVER, IICBUS_PREFVER, IICBUS_MAXVER); MODULE_VERSION(ic, 1); diff --git a/sys/net/debugnet.c b/sys/net/debugnet.c index bb59ff33a93f..8652597c55db 100644 --- a/sys/net/debugnet.c +++ b/sys/net/debugnet.c @@ -1,1076 +1,1077 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Isilon Systems, LLC. * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved. * Copyright (c) 2000 Darrell Anderson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUGNET_INTERNAL #include FEATURE(debugnet, "Debugnet support"); SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "debugnet parameters"); unsigned debugnet_debug; SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN, &debugnet_debug, 0, "Debug message verbosity (0: off; 1: on; 2: verbose)"); int debugnet_npolls = 2000; SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN, &debugnet_npolls, 0, "Number of times to poll before assuming packet loss (0.5ms per poll)"); int debugnet_nretries = 10; SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN, &debugnet_nretries, 0, "Number of retransmit attempts before giving up"); int debugnet_fib = RT_DEFAULT_FIB; SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN, &debugnet_fib, 0, "Fib to use when sending dump"); static bool g_debugnet_pcb_inuse; static struct debugnet_pcb g_dnet_pcb; /* * Simple accessors for opaque PCB. */ const unsigned char * debugnet_get_gw_mac(const struct debugnet_pcb *pcb) { MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb && pcb->dp_state >= DN_STATE_HAVE_GW_MAC); return (pcb->dp_gw_mac.octet); } /* * Start of network primitives, beginning with output primitives. */ /* * Handles creation of the ethernet header, then places outgoing packets into * the tx buffer for the NIC * * Parameters: * m The mbuf containing the packet to be sent (will be freed by * this function or the NIC driver) * ifp The interface to send on * dst The destination ethernet address (source address will be looked * up using ifp) * etype The ETHERTYPE_* value for the protocol that is being sent * * Returns: * int see errno.h, 0 for success */ int debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst, u_short etype) { struct ether_header *eh; if (((ifp->if_flags & (IFF_MONITOR | IFF_UP)) != IFF_UP) || (ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) { if_printf(ifp, "%s: interface isn't up\n", __func__); m_freem(m); return (ENETDOWN); } /* Fill in the ethernet header. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) { printf("%s: out of mbufs\n", __func__); return (ENOBUFS); } eh = mtod(m, struct ether_header *); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); memcpy(eh->ether_dhost, dst.octet, ETHER_ADDR_LEN); eh->ether_type = htons(etype); return (ifp->if_debugnet_methods->dn_transmit(ifp, m)); } /* * Unreliable transmission of an mbuf chain to the debugnet server * Note: can't handle fragmentation; fails if the packet is larger than * ifp->if_mtu after adding the UDP/IP headers * * Parameters: * pcb The debugnet context block * m mbuf chain * * Returns: * int see errno.h, 0 for success */ static int debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m) { struct udphdr *udp; MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC); M_PREPEND(m, sizeof(*udp), M_NOWAIT); if (m == NULL) { printf("%s: out of mbufs\n", __func__); return (ENOBUFS); } udp = mtod(m, void *); udp->uh_ulen = htons(m->m_pkthdr.len); /* Use this src port so that the server can connect() the socket */ udp->uh_sport = htons(pcb->dp_client_port); udp->uh_dport = htons(pcb->dp_server_port); /* Computed later (protocol-dependent). */ udp->uh_sum = 0; return (debugnet_ip_output(pcb, m)); } int debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */) { struct debugnet_ack *dn_ack; struct mbuf *m; DNETDEBUG("Acking with seqno %u\n", ntohl(seqno)); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { printf("%s: Out of mbufs\n", __func__); return (ENOBUFS); } m->m_len = sizeof(*dn_ack); m->m_pkthdr.len = sizeof(*dn_ack); MH_ALIGN(m, sizeof(*dn_ack)); dn_ack = mtod(m, void *); dn_ack->da_seqno = seqno; return (debugnet_udp_output(pcb, m)); } /* * Dummy free function for debugnet clusters. */ static void debugnet_mbuf_free(struct mbuf *m __unused) { } /* * Construct and reliably send a debugnet packet. May fail from a resource * shortage or extreme number of unacknowledged retransmissions. Wait for * an acknowledgement before returning. Splits packets into chunks small * enough to be sent without fragmentation (looks up the interface MTU) * * Parameters: * type debugnet packet type (HERALD, FINISHED, ...) * data data * datalen data size (bytes) * auxdata optional auxiliary information * * Returns: * int see errno.h, 0 for success */ int debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data, uint32_t datalen, const struct debugnet_proto_aux *auxdata) { struct debugnet_msg_hdr *dn_msg_hdr; struct mbuf *m, *m2; uint64_t want_acks; uint32_t i, pktlen, sent_so_far; int retries, polls, error; if (pcb->dp_state == DN_STATE_REMOTE_CLOSED) return (ECONNRESET); want_acks = 0; pcb->dp_rcvd_acks = 0; retries = 0; retransmit: /* Chunks can be too big to fit in packets. */ for (i = sent_so_far = 0; sent_so_far < datalen || (i == 0 && datalen == 0); i++) { pktlen = datalen - sent_so_far; /* Bound: the interface MTU (assume no IP options). */ pktlen = min(pktlen, pcb->dp_ifp->if_mtu - sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr)); /* * Check if it is retransmitting and this has been ACKed * already. */ if ((pcb->dp_rcvd_acks & (1 << i)) != 0) { sent_so_far += pktlen; continue; } /* * Get and fill a header mbuf, then chain data as an extended * mbuf. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { printf("%s: Out of mbufs\n", __func__); return (ENOBUFS); } m->m_len = sizeof(struct debugnet_msg_hdr); m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr); MH_ALIGN(m, sizeof(struct debugnet_msg_hdr)); dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *); dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i); dn_msg_hdr->mh_type = htonl(type); dn_msg_hdr->mh_len = htonl(pktlen); if (auxdata != NULL) { dn_msg_hdr->mh_offset = htobe64(auxdata->dp_offset_start + sent_so_far); dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2); } else { dn_msg_hdr->mh_offset = htobe64(sent_so_far); dn_msg_hdr->mh_aux2 = 0; } if (pktlen != 0) { m2 = m_get(M_NOWAIT, MT_DATA); if (m2 == NULL) { m_freem(m); printf("%s: Out of mbufs\n", __func__); return (ENOBUFS); } MEXTADD(m2, __DECONST(char *, data) + sent_so_far, pktlen, debugnet_mbuf_free, NULL, NULL, 0, EXT_DISPOSABLE); m2->m_len = pktlen; m_cat(m, m2); m->m_pkthdr.len += pktlen; } error = debugnet_udp_output(pcb, m); if (error != 0) return (error); /* Note that we're waiting for this packet in the bitfield. */ want_acks |= (1 << i); sent_so_far += pktlen; } if (i >= DEBUGNET_MAX_IN_FLIGHT) printf("Warning: Sent more than %d packets (%d). " "Acknowledgements will fail unless the size of " "rcvd_acks/want_acks is increased.\n", DEBUGNET_MAX_IN_FLIGHT, i); /* * Wait for acks. A *real* window would speed things up considerably. */ polls = 0; while (pcb->dp_rcvd_acks != want_acks) { if (polls++ > debugnet_npolls) { if (retries++ > debugnet_nretries) return (ETIMEDOUT); printf(". "); goto retransmit; } debugnet_network_poll(pcb); DELAY(500); if (pcb->dp_state == DN_STATE_REMOTE_CLOSED) return (ECONNRESET); } pcb->dp_seqno += i; return (0); } /* * Network input primitives. */ /* * Just introspect the header enough to fire off a seqno ack and validate * length fits. */ static void debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb) { const struct debugnet_msg_hdr *dnh; struct mbuf *m; int error; m = *mb; if (m->m_pkthdr.len < sizeof(*dnh)) { DNETDEBUG("ignoring small debugnet_msg packet\n"); return; } /* Get ND header. */ if (m->m_len < sizeof(*dnh)) { m = m_pullup(m, sizeof(*dnh)); *mb = m; if (m == NULL) { DNETDEBUG("m_pullup failed\n"); return; } } dnh = mtod(m, const void *); if (ntohl(dnh->mh_len) + sizeof(*dnh) > m->m_pkthdr.len) { DNETDEBUG("Dropping short packet.\n"); return; } /* * If the issue is transient (ENOBUFS), sender should resend. If * non-transient (like driver objecting to rx -> tx from the same * thread), not much else we can do. */ error = debugnet_ack_output(pcb, dnh->mh_seqno); if (error != 0) return; if (ntohl(dnh->mh_type) == DEBUGNET_FINISHED) { printf("Remote shut down the connection on us!\n"); pcb->dp_state = DN_STATE_REMOTE_CLOSED; /* * Continue through to the user handler so they are signalled * not to wait for further rx. */ } pcb->dp_rx_handler(pcb, mb); } static void debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport) { const struct debugnet_ack *dn_ack; struct mbuf *m; uint32_t rcv_ackno; m = *mb; /* Get Ack. */ if (m->m_len < sizeof(*dn_ack)) { m = m_pullup(m, sizeof(*dn_ack)); *mb = m; if (m == NULL) { DNETDEBUG("m_pullup failed\n"); return; } } dn_ack = mtod(m, const void *); /* Debugnet processing. */ /* * Packet is meant for us. Extract the ack sequence number and the * port number if necessary. */ rcv_ackno = ntohl(dn_ack->da_seqno); if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) { pcb->dp_server_port = sport; pcb->dp_state = DN_STATE_GOT_HERALD_PORT; } if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT) printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno); else if (rcv_ackno >= pcb->dp_seqno) { /* We're interested in this ack. Record it. */ pcb->dp_rcvd_acks |= 1 << (rcv_ackno - pcb->dp_seqno); } } void debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb) { const struct udphdr *udp; struct mbuf *m; uint16_t sport, ulen; /* UDP processing. */ m = *mb; if (m->m_pkthdr.len < sizeof(*udp)) { DNETDEBUG("ignoring small UDP packet\n"); return; } /* Get UDP headers. */ if (m->m_len < sizeof(*udp)) { m = m_pullup(m, sizeof(*udp)); *mb = m; if (m == NULL) { DNETDEBUG("m_pullup failed\n"); return; } } udp = mtod(m, const void *); /* We expect to receive UDP packets on the configured client port. */ if (ntohs(udp->uh_dport) != pcb->dp_client_port) { DNETDEBUG("not on the expected port.\n"); return; } /* Check that ulen does not exceed actual size of data. */ ulen = ntohs(udp->uh_ulen); if (m->m_pkthdr.len < ulen) { DNETDEBUG("ignoring runt UDP packet\n"); return; } sport = ntohs(udp->uh_sport); m_adj(m, sizeof(*udp)); ulen -= sizeof(*udp); if (ulen == sizeof(struct debugnet_ack)) { debugnet_handle_ack(pcb, mb, sport); return; } if (pcb->dp_rx_handler == NULL) { if (ulen < sizeof(struct debugnet_ack)) DNETDEBUG("ignoring small ACK packet\n"); else DNETDEBUG("ignoring unexpected non-ACK packet on " "half-duplex connection.\n"); return; } debugnet_handle_rx_msg(pcb, mb); } /* * Handler for incoming packets directly from the network adapter * Identifies the packet type (IP or ARP) and passes it along to one of the * helper functions debugnet_handle_ip or debugnet_handle_arp. * * It needs to partially replicate the behaviour of ether_input() and * ether_demux(). * * Parameters: * ifp the interface the packet came from * m an mbuf containing the packet received */ static void debugnet_pkt_in(struct ifnet *ifp, struct mbuf *m) { struct ifreq ifr; struct ether_header *eh; u_short etype; /* Ethernet processing. */ if ((m->m_flags & M_PKTHDR) == 0) { DNETDEBUG_IF(ifp, "discard frame without packet header\n"); goto done; } if (m->m_len < ETHER_HDR_LEN) { DNETDEBUG_IF(ifp, "discard frame without leading eth header (len %u pktlen %u)\n", m->m_len, m->m_pkthdr.len); goto done; } if ((m->m_flags & M_HASFCS) != 0) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) { DNETDEBUG_IF(ifp, "ignoring vlan packets\n"); goto done; } if (if_gethwaddr(ifp, &ifr) != 0) { DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n"); goto done; } if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost, ETHER_ADDR_LEN) != 0 && (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) { DNETDEBUG_IF(ifp, "discard frame with incorrect destination addr\n"); goto done; } MPASS(g_debugnet_pcb_inuse); /* Done ethernet processing. Strip off the ethernet header. */ m_adj(m, ETHER_HDR_LEN); switch (etype) { case ETHERTYPE_ARP: debugnet_handle_arp(&g_dnet_pcb, &m); break; case ETHERTYPE_IP: debugnet_handle_ip(&g_dnet_pcb, &m); break; default: DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype); break; } done: if (m != NULL) m_freem(m); } /* * Network polling primitive. * * Instead of assuming that most of the network stack is sane, we just poll the * driver directly for packets. */ void debugnet_network_poll(struct debugnet_pcb *pcb) { struct ifnet *ifp; ifp = pcb->dp_ifp; ifp->if_debugnet_methods->dn_poll(ifp, 1000); } /* * Start of consumer API surface. */ void debugnet_free(struct debugnet_pcb *pcb) { struct ifnet *ifp; MPASS(g_debugnet_pcb_inuse); MPASS(pcb == &g_dnet_pcb); ifp = pcb->dp_ifp; if (ifp != NULL) { if (pcb->dp_drv_input != NULL) ifp->if_input = pcb->dp_drv_input; if (pcb->dp_event_started) ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_END); } debugnet_mbuf_finish(); g_debugnet_pcb_inuse = false; memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb)); } int debugnet_connect(const struct debugnet_conn_params *dcp, struct debugnet_pcb **pcb_out) { struct debugnet_proto_aux herald_auxdata; struct debugnet_pcb *pcb; struct ifnet *ifp; int error; if (g_debugnet_pcb_inuse) { printf("%s: Only one connection at a time.\n", __func__); return (EBUSY); } pcb = &g_dnet_pcb; *pcb = (struct debugnet_pcb) { .dp_state = DN_STATE_INIT, .dp_client = dcp->dc_client, .dp_server = dcp->dc_server, .dp_gateway = dcp->dc_gateway, .dp_server_port = dcp->dc_herald_port, /* Initially */ .dp_client_port = dcp->dc_client_port, .dp_seqno = 1, .dp_ifp = dcp->dc_ifp, .dp_rx_handler = dcp->dc_rx_handler, }; /* Switch to the debugnet mbuf zones. */ debugnet_mbuf_start(); /* At least one needed parameter is missing; infer it. */ if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY || pcb->dp_ifp == NULL) { struct sockaddr_in dest_sin, *gw_sin, *local_sin; struct ifnet *rt_ifp; struct nhop_object *nh; memset(&dest_sin, 0, sizeof(dest_sin)); dest_sin = (struct sockaddr_in) { .sin_len = sizeof(dest_sin), .sin_family = AF_INET, .sin_addr.s_addr = pcb->dp_server, }; CURVNET_SET(vnet0); nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0, NHR_NONE); CURVNET_RESTORE(); if (nh == NULL) { printf("%s: Could not get route for that server.\n", __func__); error = ENOENT; goto cleanup; } + /* TODO support AF_INET6 */ if (nh->gw_sa.sa_family == AF_INET) gw_sin = &nh->gw4_sa; else { if (nh->gw_sa.sa_family == AF_LINK) DNETDEBUG("Destination address is on link.\n"); gw_sin = NULL; } MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET); local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; rt_ifp = nh->nh_ifp; if (pcb->dp_client == INADDR_ANY) pcb->dp_client = local_sin->sin_addr.s_addr; if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL) pcb->dp_gateway = gw_sin->sin_addr.s_addr; if (pcb->dp_ifp == NULL) pcb->dp_ifp = rt_ifp; } ifp = pcb->dp_ifp; if (debugnet_debug > 0) { char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN], gwbuf[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf)); inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf)); if (pcb->dp_gateway != INADDR_ANY) inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf)); DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n", serbuf, pcb->dp_server_port, (pcb->dp_gateway == INADDR_ANY) ? "" : " via ", (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf, clibuf, pcb->dp_client_port, if_name(ifp)); } /* Validate iface is online and supported. */ if (!DEBUGNET_SUPPORTED_NIC(ifp)) { printf("%s: interface '%s' does not support debugnet\n", __func__, if_name(ifp)); error = ENODEV; goto cleanup; } if ((if_getflags(ifp) & IFF_UP) == 0) { printf("%s: interface '%s' link is down\n", __func__, if_name(ifp)); error = ENXIO; goto cleanup; } ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START); pcb->dp_event_started = true; /* * We maintain the invariant that g_debugnet_pcb_inuse is always true * while the debugnet ifp's if_input is overridden with * debugnet_pkt_in. */ g_debugnet_pcb_inuse = true; /* Make the card use *our* receive callback. */ pcb->dp_drv_input = ifp->if_input; ifp->if_input = debugnet_pkt_in; printf("%s: searching for %s MAC...\n", __func__, (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway"); error = debugnet_arp_gw(pcb); if (error != 0) { printf("%s: failed to locate MAC address\n", __func__); goto cleanup; } MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC); herald_auxdata = (struct debugnet_proto_aux) { .dp_offset_start = dcp->dc_herald_offset, .dp_aux2 = dcp->dc_herald_aux2, }; error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data, dcp->dc_herald_datalen, &herald_auxdata); if (error != 0) { printf("%s: failed to herald debugnet server\n", __func__); goto cleanup; } *pcb_out = pcb; return (0); cleanup: debugnet_free(pcb); return (error); } /* * Pre-allocated dump-time mbuf tracking. * * We just track the high water mark we've ever seen and allocate appropriately * for that iface/mtu combo. */ static struct { int nmbuf; int ncl; int clsize; } dn_hwm; static struct mtx dn_hwm_lk; MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF); static void dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize) { bool any; any = false; mtx_lock(&dn_hwm_lk); if (nmbuf > dn_hwm.nmbuf) { any = true; dn_hwm.nmbuf = nmbuf; } else nmbuf = dn_hwm.nmbuf; if (ncl > dn_hwm.ncl) { any = true; dn_hwm.ncl = ncl; } else ncl = dn_hwm.ncl; if (clsize > dn_hwm.clsize) { any = true; dn_hwm.clsize = clsize; } else clsize = dn_hwm.clsize; mtx_unlock(&dn_hwm_lk); if (any) debugnet_mbuf_reinit(nmbuf, ncl, clsize); } void debugnet_any_ifnet_update(struct ifnet *ifp) { int clsize, nmbuf, ncl, nrxr; if (!DEBUGNET_SUPPORTED_NIC(ifp)) return; ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize); KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr)); /* * We need two headers per message on the transmit side. Multiply by * four to give us some breathing room. */ nmbuf = ncl * (4 + nrxr); ncl *= nrxr; /* * Bandaid for drivers that (incorrectly) advertise LinkUp before their * dn_init method is available. */ if (nmbuf == 0 || ncl == 0 || clsize == 0) { printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n", __func__, if_name(ifp), ifp); return; } dn_maybe_reinit_mbufs(nmbuf, ncl, clsize); } /* * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless * for us because drivers tend to if_attach before invoking DEBUGNET_SET(). * * On the other hand, hooking DEBUGNET_SET() itself may still be too early, * because the driver is still in attach. Since we cannot use down interfaces, * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient? ... Nope, at least * with vtnet and dhcpclient that event just never occurs. * * So that's how I've landed on the lower level ifnet_link_event. */ static void dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state) { if (link_state == LINK_STATE_UP) debugnet_any_ifnet_update(ifp); } static eventhandler_tag dn_attach_cookie; static void dn_evh_init(void *ctx __unused) { dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event, dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL); /* * DDB parsing helpers for debugnet(4) consumers. */ #ifdef DDB struct my_inet_opt { bool has_opt; const char *printname; in_addr_t *result; }; static int dn_parse_optarg_ipv4(struct my_inet_opt *opt) { in_addr_t tmp; unsigned octet; int t; tmp = 0; for (octet = 0; octet < 4; octet++) { t = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL); if (t != tNUMBER) { db_printf("%s:%s: octet %u expected number; found %d\n", __func__, opt->printname, octet, t); return (EINVAL); } /* * db_lex lexes '-' distinctly from the number itself, but * let's document that invariant. */ MPASS(db_tok_number >= 0); if (db_tok_number > UINT8_MAX) { db_printf("%s:%s: octet %u out of range: %jd\n", __func__, opt->printname, octet, (intmax_t)db_tok_number); return (EDOM); } /* Constructed host-endian and converted to network later. */ tmp = (tmp << 8) | db_tok_number; if (octet < 3) { t = db_read_token_flags(DRT_WSPACE); if (t != tDOT) { db_printf("%s:%s: octet %u expected '.'; found" " %d\n", __func__, opt->printname, octet, t); return (EINVAL); } } } *opt->result = htonl(tmp); opt->has_opt = true; return (0); } int debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result) { struct ifnet *ifp; int t, error; bool want_ifp; char ch; struct my_inet_opt opt_client = { .printname = "client", .result = &result->dd_client, }, opt_server = { .printname = "server", .result = &result->dd_server, }, opt_gateway = { .printname = "gateway", .result = &result->dd_gateway, }, *cur_inet_opt; ifp = NULL; memset(result, 0, sizeof(*result)); /* * command [space] [-] [opt] [[space] [optarg]] ... * * db_command has already lexed 'command' for us. */ t = db_read_token_flags(DRT_WSPACE); if (t == tWSPACE) t = db_read_token_flags(DRT_WSPACE); while (t != tEOL) { if (t != tMINUS) { db_printf("%s: Bad syntax; expected '-', got %d\n", cmd, t); goto usage; } t = db_read_token_flags(DRT_WSPACE); if (t != tIDENT) { db_printf("%s: Bad syntax; expected tIDENT, got %d\n", cmd, t); goto usage; } if (strlen(db_tok_string) > 1) { db_printf("%s: Bad syntax; expected single option " "flag, got '%s'\n", cmd, db_tok_string); goto usage; } want_ifp = false; cur_inet_opt = NULL; switch ((ch = db_tok_string[0])) { default: DNETDEBUG("Unexpected: '%c'\n", ch); /* FALLTHROUGH */ case 'h': goto usage; case 'c': cur_inet_opt = &opt_client; break; case 'g': cur_inet_opt = &opt_gateway; break; case 's': cur_inet_opt = &opt_server; break; case 'i': want_ifp = true; break; } t = db_read_token_flags(DRT_WSPACE); if (t != tWSPACE) { db_printf("%s: Bad syntax; expected space after " "flag %c, got %d\n", cmd, ch, t); goto usage; } if (want_ifp) { t = db_read_token_flags(DRT_WSPACE); if (t != tIDENT) { db_printf("%s: Expected interface but got %d\n", cmd, t); goto usage; } CURVNET_SET(vnet0); /* * We *don't* take a ref here because the only current * consumer, db_netdump_cmd, does not need it. It * (somewhat redundantly) extracts the if_name(), * re-lookups the ifp, and takes its own reference. */ ifp = ifunit(db_tok_string); CURVNET_RESTORE(); if (ifp == NULL) { db_printf("Could not locate interface %s\n", db_tok_string); goto cleanup; } } else { MPASS(cur_inet_opt != NULL); /* Assume IPv4 for now. */ error = dn_parse_optarg_ipv4(cur_inet_opt); if (error != 0) goto cleanup; } /* Skip (mandatory) whitespace after option, if not EOL. */ t = db_read_token_flags(DRT_WSPACE); if (t == tEOL) break; if (t != tWSPACE) { db_printf("%s: Bad syntax; expected space after " "flag %c option; got %d\n", cmd, ch, t); goto usage; } t = db_read_token_flags(DRT_WSPACE); } if (!opt_server.has_opt) { db_printf("%s: need a destination server address\n", cmd); goto usage; } result->dd_has_client = opt_client.has_opt; result->dd_has_gateway = opt_gateway.has_opt; result->dd_ifp = ifp; /* We parsed the full line to tEOL already, or bailed with an error. */ return (0); usage: db_printf("Usage: %s -s [-g -c " "-i ]\n", cmd); error = EINVAL; /* FALLTHROUGH */ cleanup: db_skip_to_eol(); return (error); } #endif /* DDB */ diff --git a/sys/net/if_disc.c b/sys/net/if_disc.c index ac0028c42f70..14d544dfd86a 100644 --- a/sys/net/if_disc.c +++ b/sys/net/if_disc.c @@ -1,246 +1,246 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ /* * Discard interface driver for protocol testing and timing. * (Based on the loopback.) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TINY_DSMTU #define DSMTU (1024+512) #else #define DSMTU 65532 #endif struct disc_softc { struct ifnet *sc_ifp; }; static int discoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int discioctl(struct ifnet *, u_long, caddr_t); static int disc_clone_create(struct if_clone *, int, caddr_t); static void disc_clone_destroy(struct ifnet *); static const char discname[] = "disc"; static MALLOC_DEFINE(M_DISC, discname, "Discard interface"); VNET_DEFINE_STATIC(struct if_clone *, disc_cloner); #define V_disc_cloner VNET(disc_cloner) static int disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct disc_softc *sc; sc = malloc(sizeof(struct disc_softc), M_DISC, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_LOOP); if (ifp == NULL) { free(sc, M_DISC); return (ENOSPC); } ifp->if_softc = sc; if_initname(ifp, discname, unit); ifp->if_mtu = DSMTU; /* * IFF_LOOPBACK should not be removed from disc's flags because * it controls what PF-specific routes are magically added when * a network address is assigned to the interface. Things just * won't work as intended w/o such routes because the output * interface selection for a packet is totally route-driven. * A valid alternative to IFF_LOOPBACK can be IFF_BROADCAST or * IFF_POINTOPOINT, but it would result in different properties * of the interface. */ ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_drv_flags = IFF_DRV_RUNNING; ifp->if_ioctl = discioctl; ifp->if_output = discoutput; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_snd.ifq_maxlen = 20; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static void disc_clone_destroy(struct ifnet *ifp) { struct disc_softc *sc; sc = ifp->if_softc; bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DISC); } static void vnet_disc_init(const void *unused __unused) { V_disc_cloner = if_clone_simple(discname, disc_clone_create, disc_clone_destroy, 0); } VNET_SYSINIT(vnet_disc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_disc_init, NULL); static void vnet_disc_uninit(const void *unused __unused) { if_clone_detach(V_disc_cloner); } VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_disc_uninit, NULL); static int disc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t disc_mod = { "if_disc", disc_modevent, NULL }; DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int discoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; M_ASSERTPKTHDR(m); /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); m->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); m_freem(m); return (0); } /* * Process an ioctl request. */ static int discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; default: error = EINVAL; } return (error); } diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 6d8b79d4dd12..3209e8a82978 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -1,1489 +1,1490 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif if (plle) *plle = NULL; eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: - if ((m->m_flags & M_MCAST) == 0) - error = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), m, dst, phdr, + if ((m->m_flags & M_MCAST) == 0) { + int af = RO_GET_FAMILY(ro, dst); + error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr, &lleflags, plle); - else { + } else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; bool loop_copy; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_provide_feedback(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); - return (if_simloop(ifp, m, dst->sa_family, 0)); + return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0)); } loop_copy = (pflags & RT_MAY_LOOP) != 0; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); - (void)if_simloop(ifp, n, dst->sa_family, hlen); + (void)if_simloop(ifp, n, RO_GET_FAMILY(ro, dst), hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_8021q_tag qtag; struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ntohs(eh->ether_type) == ETHERTYPE_QINQ) return (true); qtag.vid = 0; qtag.pcp = pcp; qtag.proto = ETHERTYPE_VLAN; if (ether_8021q_frame(mp, ifp, ifp, &qtag)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if successful, * and start output if interface not yet active. * * If KMSAN is enabled, use it to verify that the data does not contain * any uninitialized bytes. */ kmsan_check_mbuf(m, "ether_output"); return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_ethernet_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_ethernet_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. */ if ((m->m_flags & M_VLANTAG) == 0 && ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ))) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct mbuf *mn; bool needs_epoch; needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH); /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ CURVNET_SET_QUIET(ifp->if_vnet); if (__predict_false(needs_epoch)) NET_EPOCH_ENTER(et); while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred * context, so assert it is correct here. */ MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); netisr_dispatch(NETISR_ETHER, m); m = mn; } if (__predict_false(needs_epoch)) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; ifp->if_mtu = ETHERMTU; if_attach(ifp); ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap_proto(struct mbuf *m, uint16_t tag, uint16_t proto) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(proto); evl->evl_tag = htons(tag); return (m); } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. */ int vlan_mtag_pcp = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, struct ether_8021q_tag *qtag) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If PCP is set in mbuf, use it */ if ((*mp)->m_flags & M_VLANTAG) { qtag->pcp = EVL_PRIOFTAG((*mp)->m_pkthdr.ether_vtag); } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(qtag->vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(qtag->vid, qtag->pcp, 0); if ((p->if_capenable & IFCAP_VLAN_HWTAGGING) && (qtag->proto == ETHERTYPE_VLAN)) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap_proto(*mp, tag, qtag->proto); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's name, UUID and the * interface name to attempt to provide a unique but stable address. * Pseudo-interfaces which require a MAC address should use this function to * allocate non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { SHA1_CTX ctx; char *buf; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; char jailname[MAXHOSTNAMELEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); if (strncmp(uuid, DEFAULT_HOSTUUID, sizeof(uuid)) == 0) { /* Fall back to a random mac address. */ goto rando; } /* If each (vnet) jail would also have a unique hostuuid this would not * be necessary. */ getjailname(curthread->td_ucred, jailname, sizeof(jailname)); sz = asprintf(&buf, M_TEMP, "%s-%s-%s", uuid, if_name(ifp), jailname); if (sz < 0) { /* Fall back to a random mac address. */ goto rando; } SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); free(buf, M_TEMP); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } return; rando: arc4rand(hwaddr, sizeof(*hwaddr), 0); /* Unicast */ hwaddr->octet[0] &= 0xFE; /* Locally administered. */ hwaddr->octet[0] |= 0x02; } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/sys/net/if_fwsubr.c b/sys/net/if_fwsubr.c index a6c43d4d05a4..321721737d36 100644 --- a/sys/net/if_fwsubr.c +++ b/sys/net/if_fwsubr.c @@ -1,855 +1,873 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Doug Rabson * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #include static MALLOC_DEFINE(M_FWCOM, "fw_com", "firewire interface internals"); struct fw_hwaddr firewire_broadcastaddr = { 0xffffffff, 0xffffffff, 0xff, 0xff, 0xffff, 0xffffffff }; static int firewire_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct fw_com *fc = IFP2FWC(ifp); int error, type; struct m_tag *mtag; union fw_encap *enc; struct fw_hwaddr *destfw; uint8_t speed; uint16_t psize, fsize, dsize; struct mbuf *mtail; int unicast, dgl, foff; static int next_dgl; #if defined(INET) || defined(INET6) int is_gw = 0; #endif + int af = RO_GET_FAMILY(ro, dst); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } #if defined(INET) || defined(INET6) if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #endif /* * For unicast, we make a tag to store the lladdr of the * destination. This might not be the first time we have seen * the packet (for instance, the arp code might be trying to * re-send it after receiving an arp reply) so we only * allocate a tag if there isn't one there already. For * multicast, we will eventually use a different tag to store * the channel number. */ unicast = !(m->m_flags & (M_BCAST | M_MCAST)); if (unicast) { mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, NULL); if (!mtag) { mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, sizeof (struct fw_hwaddr), M_NOWAIT); if (!mtag) { error = ENOMEM; goto bad; } m_tag_prepend(m, mtag); } destfw = (struct fw_hwaddr *)(mtag + 1); } else { destfw = NULL; } + switch (af) { +#ifdef INET + case AF_INET: + type = ETHERTYPE_IP; + break; + case AF_ARP: + type = ETHERTYPE_ARP; + break; +#endif +#ifdef INET6 + case AF_INET6: + type = ETHERTYPE_IPV6; + break; +#endif + default: + if_printf(ifp, "can't handle af%d\n", af); + error = EAFNOSUPPORT; + goto bad; + } + switch (dst->sa_family) { #ifdef INET case AF_INET: /* * Only bother with arp for unicast. Allocation of * channels etc. for firewire is quite different and * doesn't fit into the arp model. */ if (unicast) { error = arpresolve(ifp, is_gw, m, dst, (u_char *) destfw, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } - type = ETHERTYPE_IP; break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_IEEE1394); - type = ETHERTYPE_ARP; if (unicast) *destfw = *(struct fw_hwaddr *) ar_tha(ah); /* * The standard arp code leaves a hole for the target * hardware address which we need to close up. */ bcopy(ar_tpa(ah), ar_tha(ah), ah->ar_pln); m_adj(m, -ah->ar_hln); break; } #endif #ifdef INET6 case AF_INET6: if (unicast) { - error = nd6_resolve(fc->fc_ifp, LLE_SF(AF_INET6, is_gw), - m, dst, (u_char *) destfw, NULL, NULL); + error = nd6_resolve(fc->fc_ifp, LLE_SF(af, is_gw), m, + dst, (u_char *) destfw, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } - type = ETHERTYPE_IPV6; break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Let BPF tap off a copy before we encapsulate. */ if (bpf_peers_present(ifp->if_bpf)) { struct fw_bpfhdr h; if (unicast) bcopy(destfw, h.firewire_dhost, 8); else bcopy(&firewire_broadcastaddr, h.firewire_dhost, 8); bcopy(&fc->fc_hwaddr, h.firewire_shost, 8); h.firewire_type = htons(type); bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m); } /* * Punt on MCAP for now and send all multicast packets on the * broadcast channel. */ if (m->m_flags & M_MCAST) m->m_flags |= M_BCAST; /* * Figure out what speed to use and what the largest supported * packet size is. For unicast, this is the minimum of what we * can speak and what they can hear. For broadcast, lets be * conservative and use S100. We could possibly improve that * by examining the bus manager's speed map or similar. We * also reduce the packet size for broadcast to account for * the GASP header. */ if (unicast) { speed = min(fc->fc_speed, destfw->sspd); psize = min(512 << speed, 2 << destfw->sender_max_rec); } else { speed = 0; psize = 512 - 2*sizeof(uint32_t); } /* * Next, we encapsulate, possibly fragmenting the original * datagram if it won't fit into a single packet. */ if (m->m_pkthdr.len <= psize - sizeof(uint32_t)) { /* * No fragmentation is necessary. */ M_PREPEND(m, sizeof(uint32_t), M_NOWAIT); if (!m) { error = ENOBUFS; goto bad; } enc = mtod(m, union fw_encap *); enc->unfrag.ether_type = type; enc->unfrag.lf = FW_ENCAP_UNFRAG; enc->unfrag.reserved = 0; /* * Byte swap the encapsulation header manually. */ enc->ul[0] = htonl(enc->ul[0]); error = (ifp->if_transmit)(ifp, m); return (error); } else { /* * Fragment the datagram, making sure to leave enough * space for the encapsulation header in each packet. */ fsize = psize - 2*sizeof(uint32_t); dgl = next_dgl++; dsize = m->m_pkthdr.len; foff = 0; while (m) { if (m->m_pkthdr.len > fsize) { /* * Split off the tail segment from the * datagram, copying our tags over. */ mtail = m_split(m, fsize, M_NOWAIT); m_tag_copy_chain(mtail, m, M_NOWAIT); } else { mtail = NULL; } /* * Add our encapsulation header to this * fragment and hand it off to the link. */ M_PREPEND(m, 2*sizeof(uint32_t), M_NOWAIT); if (!m) { error = ENOBUFS; goto bad; } enc = mtod(m, union fw_encap *); if (foff == 0) { enc->firstfrag.lf = FW_ENCAP_FIRST; enc->firstfrag.reserved1 = 0; enc->firstfrag.reserved2 = 0; enc->firstfrag.datagram_size = dsize - 1; enc->firstfrag.ether_type = type; enc->firstfrag.dgl = dgl; } else { if (mtail) enc->nextfrag.lf = FW_ENCAP_NEXT; else enc->nextfrag.lf = FW_ENCAP_LAST; enc->nextfrag.reserved1 = 0; enc->nextfrag.reserved2 = 0; enc->nextfrag.reserved3 = 0; enc->nextfrag.datagram_size = dsize - 1; enc->nextfrag.fragment_offset = foff; enc->nextfrag.dgl = dgl; } foff += m->m_pkthdr.len - 2*sizeof(uint32_t); /* * Byte swap the encapsulation header manually. */ enc->ul[0] = htonl(enc->ul[0]); enc->ul[1] = htonl(enc->ul[1]); error = (ifp->if_transmit)(ifp, m); if (error) { if (mtail) m_freem(mtail); return (ENOBUFS); } m = mtail; } return (0); } bad: if (m) m_freem(m); return (error); } static struct mbuf * firewire_input_fragment(struct fw_com *fc, struct mbuf *m, int src) { union fw_encap *enc; struct fw_reass *r; struct mbuf *mf, *mprev; int dsize; int fstart, fend, start, end, islast; uint32_t id; /* * Find an existing reassembly buffer or create a new one. */ enc = mtod(m, union fw_encap *); id = enc->firstfrag.dgl | (src << 16); STAILQ_FOREACH(r, &fc->fc_frags, fr_link) if (r->fr_id == id) break; if (!r) { r = malloc(sizeof(struct fw_reass), M_TEMP, M_NOWAIT); if (!r) { m_freem(m); return 0; } r->fr_id = id; r->fr_frags = 0; STAILQ_INSERT_HEAD(&fc->fc_frags, r, fr_link); } /* * If this fragment overlaps any other fragment, we must discard * the partial reassembly and start again. */ if (enc->firstfrag.lf == FW_ENCAP_FIRST) fstart = 0; else fstart = enc->nextfrag.fragment_offset; fend = fstart + m->m_pkthdr.len - 2*sizeof(uint32_t); dsize = enc->nextfrag.datagram_size; islast = (enc->nextfrag.lf == FW_ENCAP_LAST); for (mf = r->fr_frags; mf; mf = mf->m_nextpkt) { enc = mtod(mf, union fw_encap *); if (enc->nextfrag.datagram_size != dsize) { /* * This fragment must be from a different * packet. */ goto bad; } if (enc->firstfrag.lf == FW_ENCAP_FIRST) start = 0; else start = enc->nextfrag.fragment_offset; end = start + mf->m_pkthdr.len - 2*sizeof(uint32_t); if ((fstart < end && fend > start) || (islast && enc->nextfrag.lf == FW_ENCAP_LAST)) { /* * Overlap - discard reassembly buffer and start * again with this fragment. */ goto bad; } } /* * Find where to put this fragment in the list. */ for (mf = r->fr_frags, mprev = NULL; mf; mprev = mf, mf = mf->m_nextpkt) { enc = mtod(mf, union fw_encap *); if (enc->firstfrag.lf == FW_ENCAP_FIRST) start = 0; else start = enc->nextfrag.fragment_offset; if (start >= fend) break; } /* * If this is a last fragment and we are not adding at the end * of the list, discard the buffer. */ if (islast && mprev && mprev->m_nextpkt) goto bad; if (mprev) { m->m_nextpkt = mprev->m_nextpkt; mprev->m_nextpkt = m; /* * Coalesce forwards and see if we can make a whole * datagram. */ enc = mtod(mprev, union fw_encap *); if (enc->firstfrag.lf == FW_ENCAP_FIRST) start = 0; else start = enc->nextfrag.fragment_offset; end = start + mprev->m_pkthdr.len - 2*sizeof(uint32_t); while (end == fstart) { /* * Strip off the encap header from m and * append it to mprev, freeing m. */ m_adj(m, 2*sizeof(uint32_t)); mprev->m_nextpkt = m->m_nextpkt; mprev->m_pkthdr.len += m->m_pkthdr.len; m_cat(mprev, m); if (mprev->m_pkthdr.len == dsize + 1 + 2*sizeof(uint32_t)) { /* * We have assembled a complete packet * we must be finished. Make sure we have * merged the whole chain. */ STAILQ_REMOVE(&fc->fc_frags, r, fw_reass, fr_link); free(r, M_TEMP); m = mprev->m_nextpkt; while (m) { mf = m->m_nextpkt; m_freem(m); m = mf; } mprev->m_nextpkt = NULL; return (mprev); } /* * See if we can continue merging forwards. */ end = fend; m = mprev->m_nextpkt; if (m) { enc = mtod(m, union fw_encap *); if (enc->firstfrag.lf == FW_ENCAP_FIRST) fstart = 0; else fstart = enc->nextfrag.fragment_offset; fend = fstart + m->m_pkthdr.len - 2*sizeof(uint32_t); } else { break; } } } else { m->m_nextpkt = 0; r->fr_frags = m; } return (0); bad: while (r->fr_frags) { mf = r->fr_frags; r->fr_frags = mf->m_nextpkt; m_freem(mf); } m->m_nextpkt = 0; r->fr_frags = m; return (0); } void firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) { struct fw_com *fc = IFP2FWC(ifp); union fw_encap *enc; int type, isr; /* * The caller has already stripped off the packet header * (stream or wreqb) and marked the mbuf's M_BCAST flag * appropriately. We de-encapsulate the IP packet and pass it * up the line after handling link-level fragmentation. */ if (m->m_pkthdr.len < sizeof(uint32_t)) { if_printf(ifp, "discarding frame without " "encapsulation header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); } m = m_pullup(m, sizeof(uint32_t)); if (m == NULL) return; enc = mtod(m, union fw_encap *); /* * Byte swap the encapsulation header manually. */ enc->ul[0] = ntohl(enc->ul[0]); if (enc->unfrag.lf != 0) { m = m_pullup(m, 2*sizeof(uint32_t)); if (!m) return; enc = mtod(m, union fw_encap *); enc->ul[1] = ntohl(enc->ul[1]); m = firewire_input_fragment(fc, m, src); if (!m) return; enc = mtod(m, union fw_encap *); type = enc->firstfrag.ether_type; m_adj(m, 2*sizeof(uint32_t)); } else { type = enc->unfrag.ether_type; m_adj(m, sizeof(uint32_t)); } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } #ifdef DIAGNOSTIC if (m->m_pkthdr.rcvif != ifp) { if_printf(ifp, "Warning, frame marked as received on %s\n", m->m_pkthdr.rcvif->if_xname); } #endif #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. The link-level driver * should have left us a tag with the EUID of the sender. */ if (bpf_peers_present(ifp->if_bpf)) { struct fw_bpfhdr h; struct m_tag *mtag; mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 0); if (mtag) bcopy(mtag + 1, h.firewire_shost, 8); else bcopy(&firewire_broadcastaddr, h.firewire_dhost, 8); bcopy(&fc->fc_hwaddr, h.firewire_dhost, 8); h.firewire_type = htons(type); bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m); } if (ifp->if_flags & IFF_MONITOR) { /* * Interface marked for monitoring; discard packet. */ m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Discard packet if interface is not up */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); switch (type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); /* * Adjust the arp packet to insert an empty tha slot. */ m->m_len += ah->ar_hln; m->m_pkthdr.len += ah->ar_hln; bcopy(ar_tha(ah), ar_tpa(ah), ah->ar_pln); isr = NETISR_ARP; break; } #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return; } M_SETFIB(m, ifp->if_fib); CURVNET_SET_QUIET(ifp->if_vnet); netisr_dispatch(isr, m); CURVNET_RESTORE(); } int firewire_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(&IFP2FWC(ifp)->fc_hwaddr, &ifr->ifr_addr.sa_data[0], sizeof(struct fw_hwaddr)); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > 1500) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. */ *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; *llsa = NULL; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } void firewire_ifattach(struct ifnet *ifp, struct fw_hwaddr *llc) { struct fw_com *fc = IFP2FWC(ifp); struct ifaddr *ifa; struct sockaddr_dl *sdl; static const char* speeds[] = { "S100", "S200", "S400", "S800", "S1600", "S3200" }; fc->fc_speed = llc->sspd; STAILQ_INIT(&fc->fc_frags); ifp->if_addrlen = sizeof(struct fw_hwaddr); ifp->if_hdrlen = 0; if_attach(ifp); ifp->if_mtu = 1500; /* XXX */ ifp->if_output = firewire_output; ifp->if_resolvemulti = firewire_resolvemulti; ifp->if_broadcastaddr = (u_char *) &firewire_broadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_IEEE1394; sdl->sdl_alen = ifp->if_addrlen; bcopy(llc, LLADDR(sdl), ifp->if_addrlen); bpfattach(ifp, DLT_APPLE_IP_OVER_IEEE1394, sizeof(struct fw_hwaddr)); if_printf(ifp, "Firewire address: %8D @ 0x%04x%08x, %s, maxrec %d\n", (uint8_t *) &llc->sender_unique_ID_hi, ":", ntohs(llc->sender_unicast_FIFO_hi), ntohl(llc->sender_unicast_FIFO_lo), speeds[llc->sspd], (2 << llc->sender_max_rec)); } void firewire_ifdetach(struct ifnet *ifp) { bpfdetach(ifp); if_detach(ifp); } void firewire_busreset(struct ifnet *ifp) { struct fw_com *fc = IFP2FWC(ifp); struct fw_reass *r; struct mbuf *m; /* * Discard any partial datagrams since the host ids may have changed. */ while ((r = STAILQ_FIRST(&fc->fc_frags))) { STAILQ_REMOVE_HEAD(&fc->fc_frags, fr_link); while (r->fr_frags) { m = r->fr_frags; r->fr_frags = m->m_nextpkt; m_freem(m); } free(r, M_TEMP); } } static void * firewire_alloc(u_char type, struct ifnet *ifp) { struct fw_com *fc; fc = malloc(sizeof(struct fw_com), M_FWCOM, M_WAITOK | M_ZERO); fc->fc_ifp = ifp; return (fc); } static void firewire_free(void *com, u_char type) { free(com, M_FWCOM); } static int firewire_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if_register_com_alloc(IFT_IEEE1394, firewire_alloc, firewire_free); break; case MOD_UNLOAD: if_deregister_com_alloc(IFT_IEEE1394); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t firewire_mod = { "if_firewire", firewire_modevent, 0 }; DECLARE_MODULE(if_firewire, firewire_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(if_firewire, 1); diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 113bcb5c916e..796f427e356b 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -1,723 +1,723 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #endif /* INET6 */ #include #include #include #include #include static const char gifname[] = "gif"; MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static struct sx gif_ioctl_sx; SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); #ifdef VIMAGE static void gif_reassign(struct ifnet *, struct vnet *, char *); #endif static void gif_delete_tunnel(struct gif_softc *); static int gif_ioctl(struct ifnet *, u_long, caddr_t); static int gif_transmit(struct ifnet *, struct mbuf *); static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gif_cloner); #define V_gif_cloner VNET(gif_cloner) SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limitation on nesting of gif tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gif tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. */ #define MAX_GIF_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); static int gif_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); sc->gif_fibnum = curthread->td_proc->p_fibnum; GIF2IFP(sc) = if_alloc(IFT_GIF); GIF2IFP(sc)->if_softc = sc; if_initname(GIF2IFP(sc), gifname, unit); GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; GIF2IFP(sc)->if_ioctl = gif_ioctl; GIF2IFP(sc)->if_transmit = gif_transmit; GIF2IFP(sc)->if_qflush = gif_qflush; GIF2IFP(sc)->if_output = gif_output; #ifdef VIMAGE GIF2IFP(sc)->if_reassign = gif_reassign; #endif GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GIF2IFP(sc)); bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); return (0); } #ifdef VIMAGE static void gif_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) gif_delete_tunnel(sc); sx_xunlock(&gif_ioctl_sx); } #endif /* VIMAGE */ static void gif_clone_destroy(struct ifnet *ifp) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; gif_delete_tunnel(sc); if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gif_ioctl_sx); GIF_WAIT(); if_free(ifp); free(sc, M_GIF); } static void vnet_gif_init(const void *unused __unused) { V_gif_cloner = if_clone_simple(gifname, gif_clone_create, gif_clone_destroy, 0); #ifdef INET in_gif_init(); #endif #ifdef INET6 in6_gif_init(); #endif } VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_init, NULL); static void vnet_gif_uninit(const void *unused __unused) { if_clone_detach(V_gif_cloner); #ifdef INET in_gif_uninit(); #endif #ifdef INET6 in6_gif_uninit(); #endif } VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_uninit, NULL); static int gifmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); struct gif_list * gif_hashinit(void) { struct gif_list *hash; int i; hash = malloc(sizeof(struct gif_list) * GIF_HASH_SIZE, M_GIF, M_WAITOK); for (i = 0; i < GIF_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gif_hashdestroy(struct gif_list *hash) { free(hash, M_GIF); } #define MTAG_GIF 1080679712 static int gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; struct etherip_header *eth; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif uint32_t af; uint8_t proto, ecn; int error; NET_EPOCH_ASSERT(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto err; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gif_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GIF, V_max_gif_nesting)) != 0) { m_freem(m); goto err; } /* Now pull back the af that we stashed in the csum_data. */ if (ifp->if_bridge) af = AF_LINK; else af = m->m_pkthdr.csum_data; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gif_fibnum); BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* inner AF-specific encapsulation */ ecn = 0; switch (af) { #ifdef INET case AF_INET: proto = IPPROTO_IPV4; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto err; } ip = mtod(m, struct ip *); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos); break; #endif #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { error = ENOBUFS; goto err; } t = 0; ip6 = mtod(m, struct ip6_hdr *); ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow); ecn = (ntohl(t) >> 20) & 0xff; break; #endif case AF_LINK: proto = IPPROTO_ETHERIP; M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto err; } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; eth->eip_ver = ETHERIP_VERSION; eth->eip_resvl = 0; break; default: error = EAFNOSUPPORT; m_freem(m); goto err; } /* XXX should we check if our outer source is legal? */ /* dispatch to output logic based on outer AF */ switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: m_freem(m); } err: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static void gif_qflush(struct ifnet *ifp __unused) { } int gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gif_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } void gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) { struct etherip_header *eip; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif struct ether_header *eh; struct ifnet *oldifp; int isr, n, af; NET_EPOCH_ASSERT(); if (ifp == NULL) { /* just in case */ m_freem(m); return; } m->m_pkthdr.rcvif = ifp; m_clrprotoflags(m); switch (proto) { #ifdef INET case IPPROTO_IPV4: af = AF_INET; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) goto drop; ip = mtod(m, struct ip *); if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos) == 0) { m_freem(m); goto drop; } break; #endif #ifdef INET6 case IPPROTO_IPV6: af = AF_INET6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) goto drop; t = htonl((uint32_t)ecn << 20); ip6 = mtod(m, struct ip6_hdr *); if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow) == 0) { m_freem(m); goto drop; } break; #endif case IPPROTO_ETHERIP: af = AF_LINK; break; default: m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { uint32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } if ((ifp->if_flags & IFF_MONITOR) != 0) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return; } if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) goto drop; } /* * Put the packet to the network layer input queue according to the * specified address family. * Note: older versions of gif_input directly called network layer * input functions, e.g. ip6_input, here. We changed the policy to * prevent too many recursive calls of such input functions, which * might cause kernel panic. But the change may introduce another * problem; if the input queue is full, packets are discarded. * The kernel stack overflow really happened, and we believed * queue-full rarely occurs, so we changed the policy. */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); if (n > m->m_len) m = m_pullup(m, n); if (m == NULL) goto drop; eip = mtod(m, struct etherip_header *); if (eip->eip_ver != ETHERIP_VERSION) { /* discard unknown versions */ m_freem(m); goto drop; } m_adj_decap(m, sizeof(struct etherip_header)); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = ifp; if (ifp->if_bridge) { oldifp = ifp; eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } BRIDGE_INPUT(ifp, m); if (m != NULL && ifp != oldifp) { /* * The bridge gave us back itself or one of the * members for which the frame is addressed. */ ether_demux(ifp, m); return; } } if (m != NULL) m_freem(m); return; default: if (ng_gif_input_orphan_p != NULL) (*ng_gif_input_orphan_p)(ifp, m, af); else m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } static int gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq*)data; struct gif_softc *sc; u_int options; int error; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: case SIOCSIFFLAGS: return (0); case SIOCSIFMTU: if (ifr->ifr_mtu < GIF_MTU_MIN || ifr->ifr_mtu > GIF_MTU_MAX) return (EINVAL); else ifp->if_mtu = ifr->ifr_mtu; return (0); } sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto bad; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gif_family == 0) break; gif_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gif_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gif_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gif_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gif_fibnum = ifr->ifr_fib; break; case GIFGOPTS: options = sc->gif_options; error = copyout(&options, ifr_data_get_ptr(ifr), sizeof(options)); break; case GIFSOPTS: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; error = copyin(ifr_data_get_ptr(ifr), &options, sizeof(options)); if (error) break; if (options & ~GIF_OPTMASK) { error = EINVAL; break; } if (sc->gif_options != options) { switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_setopts(sc, options); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_setopts(sc, options); break; #endif default: /* No need to invoke AF-handler */ sc->gif_options = options; } } break; default: error = EINVAL; break; } if (error == 0 && sc->gif_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } bad: sx_xunlock(&gif_ioctl_sx); return (error); } static void gif_delete_tunnel(struct gif_softc *sc) { sx_assert(&gif_ioctl_sx, SA_XLOCKED); if (sc->gif_family != 0) { CK_LIST_REMOVE(sc, srchash); CK_LIST_REMOVE(sc, chain); /* Wait until it become safe to free gif_hdr */ GIF_WAIT(); free(sc->gif_hdr, M_GIF); } sc->gif_family = 0; GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GIF2IFP(sc), LINK_STATE_DOWN); } diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c index 19014f9fd3de..5ad452ac38e0 100644 --- a/sys/net/if_gre.c +++ b/sys/net/if_gre.c @@ -1,832 +1,832 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #ifdef RSS #include #endif #endif #ifdef INET6 #include #include #include #ifdef RSS #include #endif #endif #include #include #include #include #include #include #define GREMTU 1476 static const char grename[] = "gre"; MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation"); static struct sx gre_ioctl_sx; SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl"); static int gre_clone_create(struct if_clone *, int, caddr_t); static void gre_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gre_cloner); #define V_gre_cloner VNET(gre_cloner) #ifdef VIMAGE static void gre_reassign(struct ifnet *, struct vnet *, char *); #endif static void gre_qflush(struct ifnet *); static int gre_transmit(struct ifnet *, struct mbuf *); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void gre_delete_tunnel(struct gre_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gre tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. */ #define MAX_GRE_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gre_nesting) = MAX_GRE_NEST; #define V_max_gre_nesting VNET(max_gre_nesting) SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels"); static void vnet_gre_init(const void *unused __unused) { V_gre_cloner = if_clone_simple(grename, gre_clone_create, gre_clone_destroy, 0); #ifdef INET in_gre_init(); #endif #ifdef INET6 in6_gre_init(); #endif } VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_init, NULL); static void vnet_gre_uninit(const void *unused __unused) { if_clone_detach(V_gre_cloner); #ifdef INET in_gre_uninit(); #endif #ifdef INET6 in6_gre_uninit(); #endif /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); static int gre_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO); sc->gre_fibnum = curthread->td_proc->p_fibnum; GRE2IFP(sc) = if_alloc(IFT_TUNNEL); GRE2IFP(sc)->if_softc = sc; if_initname(GRE2IFP(sc), grename, unit); GRE2IFP(sc)->if_mtu = GREMTU; GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; GRE2IFP(sc)->if_output = gre_output; GRE2IFP(sc)->if_ioctl = gre_ioctl; GRE2IFP(sc)->if_transmit = gre_transmit; GRE2IFP(sc)->if_qflush = gre_qflush; #ifdef VIMAGE GRE2IFP(sc)->if_reassign = gre_reassign; #endif GRE2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GRE2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GRE2IFP(sc)); bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } #ifdef VIMAGE static void gre_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) gre_delete_tunnel(sc); sx_xunlock(&gre_ioctl_sx); } #endif /* VIMAGE */ static void gre_clone_destroy(struct ifnet *ifp) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; gre_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gre_ioctl_sx); GRE_WAIT(); if_free(ifp); free(sc, M_GRE); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct gre_softc *sc; uint32_t opt; int error; switch (cmd) { case SIOCSIFMTU: /* XXX: */ if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); case GRESADDRS: case GRESADDRD: case GREGADDRS: case GREGADDRD: case GRESPROTO: case GREGPROTO: return (EOPNOTSUPP); } sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gre_family == 0) break; gre_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gre_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gre_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gre_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gre_fibnum = ifr->ifr_fib; break; case GRESKEY: case GRESOPTS: case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, sizeof(opt))) != 0) break; if (cmd == GRESKEY) { if (sc->gre_key == opt) break; } else if (cmd == GRESOPTS) { if (opt & ~GRE_OPTMASK) { error = EINVAL; break; } if (sc->gre_options == opt) break; } else if (cmd == GRESPORT) { if (opt != 0 && (opt < V_ipport_hifirstauto || opt > V_ipport_hilastauto)) { error = EINVAL; break; } if (sc->gre_port == opt) break; if ((sc->gre_options & GRE_UDPENCAP) == 0) { /* * UDP encapsulation is not enabled, thus * there is no need to reattach softc. */ sc->gre_port = opt; break; } } switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_setopts(sc, cmd, opt); break; #endif default: /* * Tunnel is not yet configured. * We can just change any parameters. */ if (cmd == GRESKEY) sc->gre_key = opt; if (cmd == GRESOPTS) sc->gre_options = opt; if (cmd == GRESPORT) sc->gre_port = opt; break; } /* * XXX: Do we need to initiate change of interface * state here? */ break; case GREGKEY: error = copyout(&sc->gre_key, ifr_data_get_ptr(ifr), sizeof(sc->gre_key)); break; case GREGOPTS: error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; case GREGPORT: error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), sizeof(sc->gre_port)); break; default: error = EINVAL; break; } if (error == 0 && sc->gre_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } end: sx_xunlock(&gre_ioctl_sx); return (error); } static void gre_delete_tunnel(struct gre_softc *sc) { struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } /* * If this Tunnel was the last one that could use UDP socket, * we should unlink socket from hash table and close it. */ if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { CK_LIST_REMOVE(gs, chain); soclose(gs->so); NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx); sc->gre_so = NULL; } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } struct gre_list * gre_hashinit(void) { struct gre_list *hash; int i; hash = malloc(sizeof(struct gre_list) * GRE_HASH_SIZE, M_GRE, M_WAITOK); for (i = 0; i < GRE_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gre_hashdestroy(struct gre_list *hash) { free(hash, M_GRE); } void gre_sofree(epoch_context_t ctx) { struct gre_socket *gs; gs = __containerof(ctx, struct gre_socket, epoch_ctx); free(gs, M_GRE); } static __inline uint16_t gre_cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } void gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) { sx_assert(&gre_ioctl_sx, SA_XLOCKED); MPASS(sc->gre_options & GRE_UDPENCAP); udp->uh_dport = htons(GRE_UDPPORT); udp->uh_sport = htons(sc->gre_port); udp->uh_sum = csum; udp->uh_ulen = 0; } void gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; sx_assert(&gre_ioctl_sx, SA_XLOCKED); flags = 0; opts = gh->gre_opts; if (sc->gre_options & GRE_ENABLE_CSUM) { flags |= GRE_FLAGS_CP; sc->gre_hlen += 2 * sizeof(uint16_t); *opts++ = 0; } if (sc->gre_key != 0) { flags |= GRE_FLAGS_KP; sc->gre_hlen += sizeof(uint32_t); *opts++ = htonl(sc->gre_key); } if (sc->gre_options & GRE_ENABLE_SEQ) { flags |= GRE_FLAGS_SP; sc->gre_hlen += sizeof(uint32_t); *opts++ = 0; } else sc->gre_oseq = 0; gh->gre_flags = htons(flags); } int gre_input(struct mbuf *m, int off, int proto, void *arg) { struct gre_softc *sc = arg; struct grehdr *gh; struct ifnet *ifp; uint32_t *opts; #ifdef notyet uint32_t key; #endif uint16_t flags; int hlen, isr, af; ifp = GRE2IFP(sc); hlen = off + sizeof(struct grehdr) + 4 * sizeof(uint32_t); if (m->m_pkthdr.len < hlen) goto drop; if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto drop; } gh = (struct grehdr *)mtodo(m, off); flags = ntohs(gh->gre_flags); if (flags & ~GRE_FLAGS_MASK) goto drop; opts = gh->gre_opts; hlen = 2 * sizeof(uint16_t); if (flags & GRE_FLAGS_CP) { /* reserved1 field must be zero */ if (((uint16_t *)opts)[1] != 0) goto drop; if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0) goto drop; hlen += 2 * sizeof(uint16_t); opts++; } if (flags & GRE_FLAGS_KP) { #ifdef notyet /* * XXX: The current implementation uses the key only for outgoing * packets. But we can check the key value here, or even in the * encapcheck function. */ key = ntohl(*opts); #endif hlen += sizeof(uint32_t); opts++; } #ifdef notyet } else key = 0; if (sc->gre_key != 0 && (key != sc->gre_key || key != 0)) goto drop; #endif if (flags & GRE_FLAGS_SP) { #ifdef notyet seq = ntohl(*opts); #endif hlen += sizeof(uint32_t); } switch (ntohs(gh->gre_proto)) { case ETHERTYPE_WCCP: /* * For WCCP skip an additional 4 bytes if after GRE header * doesn't follow an IP header. */ if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40) hlen += sizeof(uint32_t); /* FALLTHROUGH */ case ETHERTYPE_IP: isr = NETISR_IP; af = AF_INET; break; case ETHERTYPE_IPV6: isr = NETISR_IPV6; af = AF_INET6; break; default: goto drop; } m_adj(m, off + hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(isr, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return (IPPROTO_DONE); } static int gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gre_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } static void gre_setseqn(struct grehdr *gh, uint32_t seq) { uint32_t *opts; uint16_t flags; opts = gh->gre_opts; flags = ntohs(gh->gre_flags); KASSERT((flags & GRE_FLAGS_SP) != 0, ("gre_setseqn called, but GRE_FLAGS_SP isn't set ")); if (flags & GRE_FLAGS_CP) opts++; if (flags & GRE_FLAGS_KP) opts++; *opts = htonl(seq); } static uint32_t gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) { uint32_t flowid = 0; if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) return (flowid); switch (af) { #ifdef INET case AF_INET: #ifdef RSS flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, mtod(m, struct ip *)->ip_dst); break; #endif flowid = mtod(m, struct ip *)->ip_src.s_addr ^ mtod(m, struct ip *)->ip_dst.s_addr; break; #endif #ifdef INET6 case AF_INET6: #ifdef RSS flowid = rss_hash_ip6_2tuple( &mtod(m, struct ip6_hdr *)->ip6_src, &mtod(m, struct ip6_hdr *)->ip6_dst); break; #endif flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; break; #endif default: break; } return (flowid); } #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; struct udphdr *uh; uint32_t af, flowid; int error, len; uint16_t proto; len = 0; GRE_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto drop; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gre_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GRE, V_max_gre_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } bcopy(sc->gre_hdr, mtod(m, void *), sc->gre_hlen); /* Determine GRE proto */ switch (af) { #ifdef INET case AF_INET: proto = htons(ETHERTYPE_IP); break; #endif #ifdef INET6 case AF_INET6: proto = htons(ETHERTYPE_IPV6); break; #endif default: m_freem(m); error = ENETDOWN; goto drop; } /* Determine offset of GRE header */ switch (sc->gre_family) { #ifdef INET case AF_INET: len = sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr); break; #endif default: m_freem(m); error = ENETDOWN; goto drop; } if (sc->gre_options & GRE_UDPENCAP) { uh = (struct udphdr *)mtodo(m, len); uh->uh_sport |= htons(V_ipport_hifirstauto) | (flowid >> 16) | (flowid & 0xFFFF); uh->uh_sport = htons(ntohs(uh->uh_sport) % V_ipport_hilastauto); uh->uh_ulen = htons(m->m_pkthdr.len - len); uh->uh_sum = gre_cksum_add(uh->uh_sum, htons(m->m_pkthdr.len - len + IPPROTO_UDP)); m->m_pkthdr.csum_flags = sc->gre_csumflags; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); len += sizeof(struct udphdr); } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) gre_setseqn(gh, sc->gre_oseq++); if (sc->gre_options & GRE_ENABLE_CSUM) { *(uint16_t *)gh->gre_opts = in_cksum_skip(m, m->m_pkthdr.len, len); } len = m->m_pkthdr.len - len; switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_output(m, af, sc->gre_hlen); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: m_freem(m); error = ENETDOWN; } drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } GRE_RUNLOCK(); return (error); } static void gre_qflush(struct ifnet *ifp __unused) { } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); diff --git a/sys/net/if_infiniband.c b/sys/net/if_infiniband.c index 244b2a5ba117..4dfbd5272d15 100644 --- a/sys/net/if_infiniband.c +++ b/sys/net/if_infiniband.c @@ -1,651 +1,652 @@ /*- * Copyright (c) 2020 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* if_lagg(4) support */ struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); #ifdef INET static inline void infiniband_ipv4_multicast_map(uint32_t addr, const uint8_t *broadcast, uint8_t *buf) { uint8_t scope; addr = ntohl(addr); scope = broadcast[5] & 0xF; buf[0] = 0; buf[1] = 0xff; buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; buf[6] = 0x40; buf[7] = 0x1b; buf[8] = broadcast[8]; buf[9] = broadcast[9]; buf[10] = 0; buf[11] = 0; buf[12] = 0; buf[13] = 0; buf[14] = 0; buf[15] = 0; buf[16] = (addr >> 24) & 0xff; buf[17] = (addr >> 16) & 0xff; buf[18] = (addr >> 8) & 0xff; buf[19] = addr & 0xff; } #endif #ifdef INET6 static inline void infiniband_ipv6_multicast_map(const struct in6_addr *addr, const uint8_t *broadcast, uint8_t *buf) { uint8_t scope; scope = broadcast[5] & 0xF; buf[0] = 0; buf[1] = 0xff; buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; buf[6] = 0x60; buf[7] = 0x1b; buf[8] = broadcast[8]; buf[9] = broadcast[9]; memcpy(&buf[10], &addr->s6_addr[6], 10); } #endif /* * This is for clients that have an infiniband_header in the mbuf. */ void infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) { struct infiniband_header *ibh; struct ether_header eh; if (mb->m_len < sizeof(*ibh)) return; ibh = mtod(mb, struct infiniband_header *); eh.ether_type = ibh->ib_protocol; memset(eh.ether_shost, 0, ETHER_ADDR_LEN); memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); mb->m_data += sizeof(*ibh); mb->m_len -= sizeof(*ibh); mb->m_pkthdr.len -= sizeof(*ibh); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(*ibh); mb->m_len += sizeof(*ibh); mb->m_pkthdr.len += sizeof(*ibh); } static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct infiniband_header *ih; struct arphdr *ah; uint16_t etype; const uint8_t *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < INFINIBAND_HDR_LEN) return (ENOMEM); ih = (struct infiniband_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } ih->ib_protocol = etype; ih->ib_reserved = 0; memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); req->bufsize = sizeof(struct infiniband_header); return (0); } static int infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, uint8_t *phdr, uint32_t *pflags, struct llentry **plle) { struct infiniband_header *ih; uint32_t lleflags = 0; int error = 0; if (plle) *plle = NULL; ih = (struct infiniband_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); } else { if (m->m_flags & M_BCAST) { memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); } else { infiniband_ipv4_multicast_map( ((const struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, ih->ib_hwaddr); } ih->ib_protocol = htons(ETHERTYPE_IP); ih->ib_reserved = 0; } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) { - error = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), m, dst, - phdr, &lleflags, plle); + int af = RO_GET_FAMILY(ro, dst); + error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr, + &lleflags, plle); } else { infiniband_ipv6_multicast_map( &((const struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, ih->ib_hwaddr); ih->ib_protocol = htons(ETHERTYPE_IPV6); ih->ib_reserved = 0; } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Infiniband output routine. */ static int infiniband_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint8_t linkhdr[INFINIBAND_HDR_LEN]; uint8_t *phdr; struct llentry *lle = NULL; struct infiniband_header *ih; int error = 0; int hlen; /* link layer header length */ uint32_t pflags; bool addref; NET_EPOCH_ASSERT(); addref = false; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_provide_feedback(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = INFINIBAND_HDR_LEN; error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); - return (if_simloop(ifp, m, dst->sa_family, 0)); + return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0)); } /* * Add local infiniband header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } if ((pflags & RT_HAS_HEADER) == 0) { ih = mtod(m, struct infiniband_header *); memcpy(ih, phdr, hlen); } /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return (ifp->if_transmit(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Process a received Infiniband packet. */ static void infiniband_input(struct ifnet *ifp, struct mbuf *m) { struct infiniband_header *ibh; struct epoch_tracker et; int isr; CURVNET_SET_QUIET(ifp->if_vnet); if ((ifp->if_flags & IFF_UP) == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); goto done; } ibh = mtod(m, struct infiniband_header *); /* * Reset layer specific mbuf flags to avoid confusing upper * layers: */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } /* Let BPF have it before we strip the header. */ INFINIBAND_BPF_MTAP(ifp, m); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); goto done; } /* Direct packet to correct FIB based on interface config. */ M_SETFIB(m, ifp->if_fib); /* Handle input from a lagg port */ if (ifp->if_type == IFT_INFINIBANDLAG) { KASSERT(lagg_input_infiniband_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_infiniband_p)(ifp, m); if (__predict_false(m == NULL)) goto done; ifp = m->m_pkthdr.rcvif; } /* * Dispatch frame to upper layer. */ switch (ibh->ib_protocol) { #ifdef INET case htons(ETHERTYPE_IP): isr = NETISR_IP; break; case htons(ETHERTYPE_ARP): if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); goto done; } isr = NETISR_ARP; break; #endif #ifdef INET6 case htons(ETHERTYPE_IPV6): isr = NETISR_IPV6; break; #endif default: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); goto done; } /* Strip off the Infiniband header. */ m_adj(m, INFINIBAND_HDR_LEN); #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. */ NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); done: CURVNET_RESTORE(); } static int infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif uint8_t *e_addr; switch (sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!INFINIBAND_IS_MULTICAST(e_addr)) return (EADDRNOTAVAIL); *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ADDR_LEN; e_addr = LLADDR(sdl); infiniband_ipv4_multicast_map( sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all of the * multicast address used for IP6. This has no meaning * in infiniband. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return (EADDRNOTAVAIL); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ADDR_LEN; e_addr = LLADDR(sdl); infiniband_ipv6_multicast_map( &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: return (EAFNOSUPPORT); } } void infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) { struct sockaddr_dl *sdl; struct ifaddr *ifa; int i; ifp->if_addrlen = INFINIBAND_ADDR_LEN; ifp->if_hdrlen = INFINIBAND_HDR_LEN; ifp->if_mtu = INFINIBAND_MTU; if_attach(ifp); ifp->if_output = infiniband_output; ifp->if_input = infiniband_input; ifp->if_resolvemulti = infiniband_resolvemulti; ifp->if_requestencap = infiniband_requestencap; if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Gbps(10); /* default value */ if (llb != NULL) ifp->if_broadcastaddr = llb; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = ifp->if_addrlen; if (lla != NULL) { memcpy(LLADDR(sdl), lla, ifp->if_addrlen); if (ifp->if_hw_addr != NULL) memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); } else { lla = LLADDR(sdl); } /* Attach ethernet compatible network device */ bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); /* Announce Infiniband MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Infiniband interface */ void infiniband_ifdetach(struct ifnet *ifp) { bpfdetach(ifp); if_detach(ifp); } static int infiniband_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: return (0); default: return (EOPNOTSUPP); } } static moduledata_t infiniband_mod = { .name = "if_infiniband", .evhand = &infiniband_modevent, }; DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(if_infiniband, 1); diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c index cbff8200806a..643ef2240fe1 100644 --- a/sys/net/if_loop.c +++ b/sys/net/if_loop.c @@ -1,455 +1,455 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #include #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif #define LO_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define LO_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) #define LO_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ CSUM_PSEUDO_HDR | \ CSUM_IP_CHECKED | CSUM_IP_VALID | \ CSUM_SCTP_VALID) int loioctl(struct ifnet *, u_long, caddr_t); int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); VNET_DEFINE(struct ifnet *, loif); /* Used externally */ #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, lo_cloner); #define V_lo_cloner VNET(lo_cloner) #endif static struct if_clone *lo_cloner; static const char loname[] = "lo"; static void lo_clone_destroy(struct ifnet *ifp) { #ifndef VIMAGE /* XXX: destroying lo0 will lead to panics. */ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); #endif bpfdetach(ifp); if_detach(ifp); if_free(ifp); } static int lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if (ifp == NULL) return (ENOSPC); if_initname(ifp, loname, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_capabilities = ifp->if_capenable = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_LINKSTATE; ifp->if_hwassist = LO_CSUM_FEATURES | LO_CSUM_FEATURES6; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); if (V_loif == NULL) V_loif = ifp; return (0); } static void vnet_loif_init(const void *unused __unused) { #ifdef VIMAGE lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, 1); V_lo_cloner = lo_cloner; #else lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, 1); #endif } VNET_SYSINIT(vnet_loif_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_loif_init, NULL); #ifdef VIMAGE static void vnet_loif_uninit(const void *unused __unused) { if_clone_detach(V_lo_cloner); V_loif = NULL; } VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_loif_uninit, NULL); #endif static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return (EINVAL); default: return (EOPNOTSUPP); } return (0); } static moduledata_t loop_mod = { "if_lo", loop_modevent, 0 }; DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; #ifdef MAC int error; #endif M_ASSERTPKTHDR(m); /* check if we have the packet header */ #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif if (ro != NULL && ro->ro_flags & (RT_REJECT|RT_BLACKHOLE)) { m_freem(m); return (ro->ro_flags & RT_BLACKHOLE ? 0 : EHOSTUNREACH); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); #ifdef RSS M_HASHTYPE_CLEAR(m); #endif /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); #if 1 /* XXX */ switch (af) { case AF_INET: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES; break; case AF_INET6: #if 0 /* * XXX-BZ for now always claim the checksum is good despite * any interface flags. This is a workaround for 9.1-R and * a proper solution ought to be sought later. */ if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } #else m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; #endif m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES6; break; default: printf("looutput: af=%d unexpected\n", af); m_freem(m); return (EAFNOSUPPORT); } #endif return (if_simloop(ifp, m, af, 0)); } /* * if_simloop() * * This function is to support software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. * * This function expects the packet to include the media header of length hlen. */ int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) { int isr; M_ASSERTPKTHDR(m); m_tag_delete_nonpersistent(m); m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Let BPF see incoming packet in the following manner: * - Emulated packet loopback for a simplex interface * (net/if_ethersubr.c) * -> passes it to ifp's BPF * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) * -> not passes it to any BPF * - Normal packet loopback from myself to myself (net/if_loop.c) * -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0) */ if (hlen > 0) { if (bpf_peers_present(ifp->if_bpf)) { bpf_mtap(ifp->if_bpf, m); } } else { if (bpf_peers_present(V_loif->if_bpf)) { if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. */ bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #ifndef __NO_STRICT_ALIGNMENT /* * Some archs do not like unaligned data, so * we move data down in the first mbuf. */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); m->m_data -= (mtod(m,vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; isr = NETISR_IPV6; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); netisr_queue(isr, m); /* mbuf is free'd on failure. */ return (0); } /* * Process an ioctl request. */ /* ARGSUSED */ int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0, mask; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_UP); /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: if_link_state_change(ifp, (ifp->if_flags & IFF_UP) ? LINK_STATE_UP: LINK_STATE_DOWN); break; case SIOCSIFCAP: mask = ifp->if_capenable ^ ifr->ifr_reqcap; if ((mask & IFCAP_RXCSUM) != 0) ifp->if_capenable ^= IFCAP_RXCSUM; if ((mask & IFCAP_TXCSUM) != 0) ifp->if_capenable ^= IFCAP_TXCSUM; if ((mask & IFCAP_RXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } if ((mask & IFCAP_TXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist = LO_CSUM_FEATURES; #if 0 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= LO_CSUM_FEATURES6; #endif break; default: error = EINVAL; } return (error); } diff --git a/sys/net/if_me.c b/sys/net/if_me.c index aafc07c2b203..067ab22cd84d 100644 --- a/sys/net/if_me.c +++ b/sys/net/if_me.c @@ -1,686 +1,686 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MEMTU (1500 - sizeof(struct mobhdr)) static const char mename[] = "me"; static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP"); /* Minimal forwarding header RFC 2004 */ struct mobhdr { uint8_t mob_proto; /* protocol */ uint8_t mob_flags; /* flags */ #define MOB_FLAGS_SP 0x80 /* source present */ uint16_t mob_csum; /* header checksum */ struct in_addr mob_dst; /* original destination address */ struct in_addr mob_src; /* original source addr (optional) */ } __packed; struct me_softc { struct ifnet *me_ifp; u_int me_fibnum; struct in_addr me_src; struct in_addr me_dst; CK_LIST_ENTRY(me_softc) chain; CK_LIST_ENTRY(me_softc) srchash; }; CK_LIST_HEAD(me_list, me_softc); #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) #define ME_RLOCK_TRACKER struct epoch_tracker me_et #define ME_RLOCK() epoch_enter_preempt(net_epoch_preempt, &me_et) #define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &me_et) #define ME_WAIT() epoch_wait_preempt(net_epoch_preempt) #ifndef ME_HASH_SIZE #define ME_HASH_SIZE (1 << 4) #endif VNET_DEFINE_STATIC(struct me_list *, me_hashtbl) = NULL; VNET_DEFINE_STATIC(struct me_list *, me_srchashtbl) = NULL; #define V_me_hashtbl VNET(me_hashtbl) #define V_me_srchashtbl VNET(me_srchashtbl) #define ME_HASH(src, dst) (V_me_hashtbl[\ me_hashval((src), (dst)) & (ME_HASH_SIZE - 1)]) #define ME_SRCHASH(src) (V_me_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (ME_HASH_SIZE - 1)]) static struct sx me_ioctl_sx; SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl"); static int me_clone_create(struct if_clone *, int, caddr_t); static void me_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, me_cloner); #define V_me_cloner VNET(me_cloner) #ifdef VIMAGE static void me_reassign(struct ifnet *, struct vnet *, char *); #endif static void me_qflush(struct ifnet *); static int me_transmit(struct ifnet *, struct mbuf *); static int me_ioctl(struct ifnet *, u_long, caddr_t); static int me_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int me_input(struct mbuf *, int, int, void *); static int me_set_tunnel(struct me_softc *, in_addr_t, in_addr_t); static void me_delete_tunnel(struct me_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Minimal Encapsulation for IP (RFC 2004)"); #ifndef MAX_ME_NEST #define MAX_ME_NEST 1 #endif VNET_DEFINE_STATIC(int, max_me_nesting) = MAX_ME_NEST; #define V_max_me_nesting VNET(max_me_nesting) SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_me_nesting), 0, "Max nested tunnels"); static uint32_t me_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static struct me_list * me_hashinit(void) { struct me_list *hash; int i; hash = malloc(sizeof(struct me_list) * ME_HASH_SIZE, M_IFME, M_WAITOK); for (i = 0; i < ME_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } static void vnet_me_init(const void *unused __unused) { V_me_cloner = if_clone_simple(mename, me_clone_create, me_clone_destroy, 0); } VNET_SYSINIT(vnet_me_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_init, NULL); static void vnet_me_uninit(const void *unused __unused) { if (V_me_hashtbl != NULL) { free(V_me_hashtbl, M_IFME); V_me_hashtbl = NULL; ME_WAIT(); free(V_me_srchashtbl, M_IFME); } if_clone_detach(V_me_cloner); } VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_uninit, NULL); static int me_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct me_softc *sc; sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO); sc->me_fibnum = curthread->td_proc->p_fibnum; ME2IFP(sc) = if_alloc(IFT_TUNNEL); ME2IFP(sc)->if_softc = sc; if_initname(ME2IFP(sc), mename, unit); ME2IFP(sc)->if_mtu = MEMTU; ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; ME2IFP(sc)->if_output = me_output; ME2IFP(sc)->if_ioctl = me_ioctl; ME2IFP(sc)->if_transmit = me_transmit; ME2IFP(sc)->if_qflush = me_qflush; #ifdef VIMAGE ME2IFP(sc)->if_reassign = me_reassign; #endif ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(ME2IFP(sc)); bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } #ifdef VIMAGE static void me_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) me_delete_tunnel(sc); sx_xunlock(&me_ioctl_sx); } #endif /* VIMAGE */ static void me_clone_destroy(struct ifnet *ifp) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; me_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&me_ioctl_sx); ME_WAIT(); if_free(ifp); free(sc, M_IFME); } static int me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *src, *dst; struct me_softc *sc; int error; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); } sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } error = me_set_tunnel(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); break; case SIOCDIFPHYADDR: me_delete_tunnel(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (!ME_READY(sc)) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); switch (cmd) { case SIOCGIFPSRCADDR: src->sin_addr = sc->me_src; break; case SIOCGIFPDSTADDR: src->sin_addr = sc->me_dst; break; } error = prison_if(curthread->td_ucred, sintosa(src)); if (error != 0) memset(src, 0, sizeof(*src)); break; case SIOCGTUNFIB: ifr->ifr_fib = sc->me_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->me_fibnum = ifr->ifr_fib; break; default: error = EINVAL; break; } end: sx_xunlock(&me_ioctl_sx); return (error); } static int me_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct me_softc *sc; if (V_me_hashtbl == NULL) return (0); NET_EPOCH_ASSERT(); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { if (sc->me_src.s_addr == ip->ip_dst.s_addr && sc->me_dst.s_addr == ip->ip_src.s_addr) { if ((ME2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* * Check that ingress address belongs to local host. */ static void me_set_running(struct me_softc *sc) { if (in_localip(sc->me_src)) ME2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. */ static void me_srcaddr(void *arg __unused, const struct sockaddr *sa, int event __unused) { const struct sockaddr_in *sin; struct me_softc *sc; /* Check that VNET is ready */ if (V_me_hashtbl == NULL) return; NET_EPOCH_ASSERT(); sin = (const struct sockaddr_in *)sa; CK_LIST_FOREACH(sc, &ME_SRCHASH(sin->sin_addr.s_addr), srchash) { if (sc->me_src.s_addr != sin->sin_addr.s_addr) continue; me_set_running(sc); } } static int me_set_tunnel(struct me_softc *sc, in_addr_t src, in_addr_t dst) { struct me_softc *tmp; sx_assert(&me_ioctl_sx, SA_XLOCKED); if (V_me_hashtbl == NULL) { V_me_hashtbl = me_hashinit(); V_me_srchashtbl = me_hashinit(); } if (sc->me_src.s_addr == src && sc->me_dst.s_addr == dst) return (0); CK_LIST_FOREACH(tmp, &ME_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->me_src.s_addr == src && tmp->me_dst.s_addr == dst) return (EADDRNOTAVAIL); } me_delete_tunnel(sc); sc->me_dst.s_addr = dst; sc->me_src.s_addr = src; CK_LIST_INSERT_HEAD(&ME_HASH(src, dst), sc, chain); CK_LIST_INSERT_HEAD(&ME_SRCHASH(src), sc, srchash); me_set_running(sc); if_link_state_change(ME2IFP(sc), LINK_STATE_UP); return (0); } static void me_delete_tunnel(struct me_softc *sc) { sx_assert(&me_ioctl_sx, SA_XLOCKED); if (ME_READY(sc)) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); ME_WAIT(); sc->me_src.s_addr = 0; sc->me_dst.s_addr = 0; ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ME2IFP(sc), LINK_STATE_DOWN); } } static uint16_t me_in_cksum(uint16_t *p, int nwords) { uint32_t sum = 0; while (nwords-- > 0) sum += *p++; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int me_input(struct mbuf *m, int off, int proto, void *arg) { struct me_softc *sc = arg; struct mobhdr *mh; struct ifnet *ifp; struct ip *ip; int hlen; NET_EPOCH_ASSERT(); ifp = ME2IFP(sc); /* checks for short packets */ hlen = sizeof(struct mobhdr); if (m->m_pkthdr.len < sizeof(struct ip) + hlen) hlen -= sizeof(struct in_addr); if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) goto drop; mh = (struct mobhdr *)mtodo(m, sizeof(struct ip)); /* check for wrong flags */ if (mh->mob_flags & (~MOB_FLAGS_SP)) { m_freem(m); goto drop; } if (mh->mob_flags) { if (hlen != sizeof(struct mobhdr)) { m_freem(m); goto drop; } } else hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); /* check mobile header checksum */ if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) { m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif ip = mtod(m, struct ip *); ip->ip_dst = mh->mob_dst; ip->ip_p = mh->mob_proto; ip->ip_sum = 0; ip->ip_len = htons(m->m_pkthdr.len - hlen); if (mh->mob_flags) ip->ip_src = mh->mob_src; memmove(mtodo(m, hlen), ip, sizeof(struct ip)); m_adj(m, hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); M_SETFIB(m, ifp->if_fib); hlen = AF_INET; BPF_MTAP2(ifp, &hlen, sizeof(hlen), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(NETISR_IP, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (IPPROTO_DONE); } static int me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, - struct route *ro __unused) + struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } #define MTAG_ME 1414491977 static int me_transmit(struct ifnet *ifp, struct mbuf *m) { ME_RLOCK_TRACKER; struct mobhdr mh; struct me_softc *sc; struct ip *ip; uint32_t af; int error, hlen, plen; ME_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif error = ENETDOWN; sc = ifp->if_softc; if (sc == NULL || !ME_READY(sc) || (ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_ME, V_max_me_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; if (af != AF_INET) { error = EAFNOSUPPORT; m_freem(m); goto drop; } if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto drop; } ip = mtod(m, struct ip *); /* Fragmented datagramms shouldn't be encapsulated */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { error = EINVAL; m_freem(m); goto drop; } mh.mob_proto = ip->ip_p; mh.mob_src = ip->ip_src; mh.mob_dst = ip->ip_dst; if (in_hosteq(sc->me_src, ip->ip_src)) { hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); mh.mob_flags = 0; } else { hlen = sizeof(struct mobhdr); mh.mob_flags = MOB_FLAGS_SP; } BPF_MTAP2(ifp, &af, sizeof(af), m); plen = m->m_pkthdr.len; ip->ip_src = sc->me_src; ip->ip_dst = sc->me_dst; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->me_fibnum); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) { error = ENOBUFS; goto drop; } memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip)); ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_p = IPPROTO_MOBILE; ip->ip_sum = 0; mh.mob_csum = 0; mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t)); bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } ME_RUNLOCK(); return (error); } static void me_qflush(struct ifnet *ifp __unused) { } static const struct srcaddrtab *me_srcaddrtab = NULL; static const struct encaptab *ecookie = NULL; static const struct encap_config me_encap_cfg = { .proto = IPPROTO_MOBILE, .min_length = sizeof(struct ip) + sizeof(struct mobhdr) - sizeof(in_addr_t), .exact_match = ENCAP_DRV_LOOKUP, .lookup = me_lookup, .input = me_input }; static int memodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: me_srcaddrtab = ip_encap_register_srcaddr(me_srcaddr, NULL, M_WAITOK); ecookie = ip_encap_attach(&me_encap_cfg, NULL, M_WAITOK); break; case MOD_UNLOAD: ip_encap_detach(ecookie); ip_encap_unregister_srcaddr(me_srcaddrtab); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t me_mod = { "if_me", memodevent, 0 }; DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_me, 1); diff --git a/sys/net/if_spppsubr.c b/sys/net/if_spppsubr.c index fbf7b0ea8f4c..804367025532 100644 --- a/sys/net/if_spppsubr.c +++ b/sys/net/if_spppsubr.c @@ -1,5417 +1,5418 @@ /* * Synchronous PPP/Cisco/Frame Relay link level subroutines. * Keepalive protocol implemented in both Cisco and PPP modes. */ /*- * Copyright (C) 1994-2000 Cronyx Engineering. * Author: Serge Vakulenko, * * Heavily revamped to conform to RFC 1661. * Copyright (C) 1997, 2001 Joerg Wunsch. * * This software is distributed with NO WARRANTIES, not even the implied * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * Authors grant any other persons or organisations permission to use * or modify this software as long as this message is kept with the software, * all derivative works or modified versions. * * From: Version 2.4, Thu Apr 30 17:17:21 MSD 1997 * * $FreeBSD$ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 #include #endif #include #include #define IOCTL_CMD_T u_long #define MAXALIVECNT 3 /* max. alive packets */ /* * Interface flags that can be set in an ifconfig command. * * Setting link0 will make the link passive, i.e. it will be marked * as being administrative openable, but won't be opened to begin * with. Incoming calls will be answered, or subsequent calls with * -link1 will cause the administrative open of the LCP layer. * * Setting link1 will cause the link to auto-dial only as packets * arrive to be sent. * * Setting IFF_DEBUG will syslog the option negotiation and state * transitions at level kern.debug. Note: all logs consistently look * like * * : * * with being something like "bppp0", and * being one of "lcp", "ipcp", "cisco", "chap", "pap", etc. */ #define IFF_PASSIVE IFF_LINK0 /* wait passively for connection */ #define IFF_AUTO IFF_LINK1 /* auto-dial on output */ #define IFF_CISCO IFF_LINK2 /* auto-dial on output */ #define PPP_ALLSTATIONS 0xff /* All-Stations broadcast address */ #define PPP_UI 0x03 /* Unnumbered Information */ #define PPP_IP 0x0021 /* Internet Protocol */ #define PPP_ISO 0x0023 /* ISO OSI Protocol */ #define PPP_XNS 0x0025 /* Xerox NS Protocol */ #define PPP_IPX 0x002b /* Novell IPX Protocol */ #define PPP_VJ_COMP 0x002d /* VJ compressed TCP/IP */ #define PPP_VJ_UCOMP 0x002f /* VJ uncompressed TCP/IP */ #define PPP_IPV6 0x0057 /* Internet Protocol Version 6 */ #define PPP_LCP 0xc021 /* Link Control Protocol */ #define PPP_PAP 0xc023 /* Password Authentication Protocol */ #define PPP_CHAP 0xc223 /* Challenge-Handshake Auth Protocol */ #define PPP_IPCP 0x8021 /* Internet Protocol Control Protocol */ #define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */ #define CONF_REQ 1 /* PPP configure request */ #define CONF_ACK 2 /* PPP configure acknowledge */ #define CONF_NAK 3 /* PPP configure negative ack */ #define CONF_REJ 4 /* PPP configure reject */ #define TERM_REQ 5 /* PPP terminate request */ #define TERM_ACK 6 /* PPP terminate acknowledge */ #define CODE_REJ 7 /* PPP code reject */ #define PROTO_REJ 8 /* PPP protocol reject */ #define ECHO_REQ 9 /* PPP echo request */ #define ECHO_REPLY 10 /* PPP echo reply */ #define DISC_REQ 11 /* PPP discard request */ #define LCP_OPT_MRU 1 /* maximum receive unit */ #define LCP_OPT_ASYNC_MAP 2 /* async control character map */ #define LCP_OPT_AUTH_PROTO 3 /* authentication protocol */ #define LCP_OPT_QUAL_PROTO 4 /* quality protocol */ #define LCP_OPT_MAGIC 5 /* magic number */ #define LCP_OPT_RESERVED 6 /* reserved */ #define LCP_OPT_PROTO_COMP 7 /* protocol field compression */ #define LCP_OPT_ADDR_COMP 8 /* address/control field compression */ #define IPCP_OPT_ADDRESSES 1 /* both IP addresses; deprecated */ #define IPCP_OPT_COMPRESSION 2 /* IP compression protocol (VJ) */ #define IPCP_OPT_ADDRESS 3 /* local IP address */ #define IPV6CP_OPT_IFID 1 /* interface identifier */ #define IPV6CP_OPT_COMPRESSION 2 /* IPv6 compression protocol */ #define IPCP_COMP_VJ 0x2d /* Code for VJ compression */ #define PAP_REQ 1 /* PAP name/password request */ #define PAP_ACK 2 /* PAP acknowledge */ #define PAP_NAK 3 /* PAP fail */ #define CHAP_CHALLENGE 1 /* CHAP challenge request */ #define CHAP_RESPONSE 2 /* CHAP challenge response */ #define CHAP_SUCCESS 3 /* CHAP response ok */ #define CHAP_FAILURE 4 /* CHAP response failed */ #define CHAP_MD5 5 /* hash algorithm - MD5 */ #define CISCO_MULTICAST 0x8f /* Cisco multicast address */ #define CISCO_UNICAST 0x0f /* Cisco unicast address */ #define CISCO_KEEPALIVE 0x8035 /* Cisco keepalive protocol */ #define CISCO_ADDR_REQ 0 /* Cisco address request */ #define CISCO_ADDR_REPLY 1 /* Cisco address reply */ #define CISCO_KEEPALIVE_REQ 2 /* Cisco keepalive request */ /* states are named and numbered according to RFC 1661 */ #define STATE_INITIAL 0 #define STATE_STARTING 1 #define STATE_CLOSED 2 #define STATE_STOPPED 3 #define STATE_CLOSING 4 #define STATE_STOPPING 5 #define STATE_REQ_SENT 6 #define STATE_ACK_RCVD 7 #define STATE_ACK_SENT 8 #define STATE_OPENED 9 static MALLOC_DEFINE(M_SPPP, "sppp", "synchronous PPP interface internals"); struct ppp_header { u_char address; u_char control; u_short protocol; } __packed; #define PPP_HEADER_LEN sizeof (struct ppp_header) struct lcp_header { u_char type; u_char ident; u_short len; } __packed; #define LCP_HEADER_LEN sizeof (struct lcp_header) struct cisco_packet { u_long type; u_long par1; u_long par2; u_short rel; u_short time0; u_short time1; } __packed; #define CISCO_PACKET_LEN sizeof (struct cisco_packet) /* * We follow the spelling and capitalization of RFC 1661 here, to make * it easier comparing with the standard. Please refer to this RFC in * case you can't make sense out of these abbreviation; it will also * explain the semantics related to the various events and actions. */ struct cp { u_short proto; /* PPP control protocol number */ u_char protoidx; /* index into state table in struct sppp */ u_char flags; #define CP_LCP 0x01 /* this is the LCP */ #define CP_AUTH 0x02 /* this is an authentication protocol */ #define CP_NCP 0x04 /* this is a NCP */ #define CP_QUAL 0x08 /* this is a quality reporting protocol */ const char *name; /* name of this control protocol */ /* event handlers */ void (*Up)(struct sppp *sp); void (*Down)(struct sppp *sp); void (*Open)(struct sppp *sp); void (*Close)(struct sppp *sp); void (*TO)(void *sp); int (*RCR)(struct sppp *sp, struct lcp_header *h, int len); void (*RCN_rej)(struct sppp *sp, struct lcp_header *h, int len); void (*RCN_nak)(struct sppp *sp, struct lcp_header *h, int len); /* actions */ void (*tlu)(struct sppp *sp); void (*tld)(struct sppp *sp); void (*tls)(struct sppp *sp); void (*tlf)(struct sppp *sp); void (*scr)(struct sppp *sp); }; #define SPP_FMT "%s: " #define SPP_ARGS(ifp) (ifp)->if_xname #define SPPP_LOCK(sp) mtx_lock (&(sp)->mtx) #define SPPP_UNLOCK(sp) mtx_unlock (&(sp)->mtx) #define SPPP_LOCK_ASSERT(sp) mtx_assert (&(sp)->mtx, MA_OWNED) #define SPPP_LOCK_OWNED(sp) mtx_owned (&(sp)->mtx) #ifdef INET /* * The following disgusting hack gets around the problem that IP TOS * can't be set yet. We want to put "interactive" traffic on a high * priority queue. To decide if traffic is interactive, we check that * a) it is TCP and b) one of its ports is telnet, rlogin or ftp control. * * XXX is this really still necessary? - joerg - */ static const u_short interactive_ports[8] = { 0, 513, 0, 0, 0, 21, 0, 23, }; #define INTERACTIVE(p) (interactive_ports[(p) & 7] == (p)) #endif /* almost every function needs these */ #define STDDCL \ struct ifnet *ifp = SP2IFP(sp); \ int debug = ifp->if_flags & IFF_DEBUG static int sppp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2); static void sppp_cisco_input(struct sppp *sp, struct mbuf *m); static void sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m); static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type, u_char ident, u_short len, void *data); /* static void sppp_cp_timeout(void *arg); */ static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate); static void sppp_auth_send(const struct cp *cp, struct sppp *sp, unsigned int type, unsigned int id, ...); static void sppp_up_event(const struct cp *cp, struct sppp *sp); static void sppp_down_event(const struct cp *cp, struct sppp *sp); static void sppp_open_event(const struct cp *cp, struct sppp *sp); static void sppp_close_event(const struct cp *cp, struct sppp *sp); static void sppp_to_event(const struct cp *cp, struct sppp *sp); static void sppp_null(struct sppp *sp); static void sppp_pp_up(struct sppp *sp); static void sppp_pp_down(struct sppp *sp); static void sppp_lcp_init(struct sppp *sp); static void sppp_lcp_up(struct sppp *sp); static void sppp_lcp_down(struct sppp *sp); static void sppp_lcp_open(struct sppp *sp); static void sppp_lcp_close(struct sppp *sp); static void sppp_lcp_TO(void *sp); static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_lcp_tlu(struct sppp *sp); static void sppp_lcp_tld(struct sppp *sp); static void sppp_lcp_tls(struct sppp *sp); static void sppp_lcp_tlf(struct sppp *sp); static void sppp_lcp_scr(struct sppp *sp); static void sppp_lcp_check_and_close(struct sppp *sp); static int sppp_ncp_check(struct sppp *sp); static void sppp_ipcp_init(struct sppp *sp); static void sppp_ipcp_up(struct sppp *sp); static void sppp_ipcp_down(struct sppp *sp); static void sppp_ipcp_open(struct sppp *sp); static void sppp_ipcp_close(struct sppp *sp); static void sppp_ipcp_TO(void *sp); static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipcp_tlu(struct sppp *sp); static void sppp_ipcp_tld(struct sppp *sp); static void sppp_ipcp_tls(struct sppp *sp); static void sppp_ipcp_tlf(struct sppp *sp); static void sppp_ipcp_scr(struct sppp *sp); static void sppp_ipv6cp_init(struct sppp *sp); static void sppp_ipv6cp_up(struct sppp *sp); static void sppp_ipv6cp_down(struct sppp *sp); static void sppp_ipv6cp_open(struct sppp *sp); static void sppp_ipv6cp_close(struct sppp *sp); static void sppp_ipv6cp_TO(void *sp); static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); static void sppp_ipv6cp_tlu(struct sppp *sp); static void sppp_ipv6cp_tld(struct sppp *sp); static void sppp_ipv6cp_tls(struct sppp *sp); static void sppp_ipv6cp_tlf(struct sppp *sp); static void sppp_ipv6cp_scr(struct sppp *sp); static void sppp_pap_input(struct sppp *sp, struct mbuf *m); static void sppp_pap_init(struct sppp *sp); static void sppp_pap_open(struct sppp *sp); static void sppp_pap_close(struct sppp *sp); static void sppp_pap_TO(void *sp); static void sppp_pap_my_TO(void *sp); static void sppp_pap_tlu(struct sppp *sp); static void sppp_pap_tld(struct sppp *sp); static void sppp_pap_scr(struct sppp *sp); static void sppp_chap_input(struct sppp *sp, struct mbuf *m); static void sppp_chap_init(struct sppp *sp); static void sppp_chap_open(struct sppp *sp); static void sppp_chap_close(struct sppp *sp); static void sppp_chap_TO(void *sp); static void sppp_chap_tlu(struct sppp *sp); static void sppp_chap_tld(struct sppp *sp); static void sppp_chap_scr(struct sppp *sp); static const char *sppp_auth_type_name(u_short proto, u_char type); static const char *sppp_cp_type_name(u_char type); #ifdef INET static const char *sppp_dotted_quad(u_long addr); static const char *sppp_ipcp_opt_name(u_char opt); #endif #ifdef INET6 static const char *sppp_ipv6cp_opt_name(u_char opt); #endif static const char *sppp_lcp_opt_name(u_char opt); static const char *sppp_phase_name(enum ppp_phase phase); static const char *sppp_proto_name(u_short proto); static const char *sppp_state_name(int state); static int sppp_params(struct sppp *sp, u_long cmd, void *data); static int sppp_strnlen(u_char *p, int max); static void sppp_keepalive(void *dummy); static void sppp_phase_network(struct sppp *sp); static void sppp_print_bytes(const u_char *p, u_short len); static void sppp_print_string(const char *p, u_short len); static void sppp_qflush(struct ifqueue *ifq); #ifdef INET static void sppp_set_ip_addr(struct sppp *sp, u_long src); #endif #ifdef INET6 static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst, struct in6_addr *srcmask); #ifdef IPV6CP_MYIFID_DYN static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src); static void sppp_gen_ip6_addr(struct sppp *sp, const struct in6_addr *src); #endif static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *src); #endif /* if_start () wrapper */ static void sppp_ifstart (struct ifnet *ifp); /* our control protocol descriptors */ static const struct cp lcp = { PPP_LCP, IDX_LCP, CP_LCP, "lcp", sppp_lcp_up, sppp_lcp_down, sppp_lcp_open, sppp_lcp_close, sppp_lcp_TO, sppp_lcp_RCR, sppp_lcp_RCN_rej, sppp_lcp_RCN_nak, sppp_lcp_tlu, sppp_lcp_tld, sppp_lcp_tls, sppp_lcp_tlf, sppp_lcp_scr }; static const struct cp ipcp = { PPP_IPCP, IDX_IPCP, #ifdef INET /* don't run IPCP if there's no IPv4 support */ CP_NCP, #else 0, #endif "ipcp", sppp_ipcp_up, sppp_ipcp_down, sppp_ipcp_open, sppp_ipcp_close, sppp_ipcp_TO, sppp_ipcp_RCR, sppp_ipcp_RCN_rej, sppp_ipcp_RCN_nak, sppp_ipcp_tlu, sppp_ipcp_tld, sppp_ipcp_tls, sppp_ipcp_tlf, sppp_ipcp_scr }; static const struct cp ipv6cp = { PPP_IPV6CP, IDX_IPV6CP, #ifdef INET6 /*don't run IPv6CP if there's no IPv6 support*/ CP_NCP, #else 0, #endif "ipv6cp", sppp_ipv6cp_up, sppp_ipv6cp_down, sppp_ipv6cp_open, sppp_ipv6cp_close, sppp_ipv6cp_TO, sppp_ipv6cp_RCR, sppp_ipv6cp_RCN_rej, sppp_ipv6cp_RCN_nak, sppp_ipv6cp_tlu, sppp_ipv6cp_tld, sppp_ipv6cp_tls, sppp_ipv6cp_tlf, sppp_ipv6cp_scr }; static const struct cp pap = { PPP_PAP, IDX_PAP, CP_AUTH, "pap", sppp_null, sppp_null, sppp_pap_open, sppp_pap_close, sppp_pap_TO, 0, 0, 0, sppp_pap_tlu, sppp_pap_tld, sppp_null, sppp_null, sppp_pap_scr }; static const struct cp chap = { PPP_CHAP, IDX_CHAP, CP_AUTH, "chap", sppp_null, sppp_null, sppp_chap_open, sppp_chap_close, sppp_chap_TO, 0, 0, 0, sppp_chap_tlu, sppp_chap_tld, sppp_null, sppp_null, sppp_chap_scr }; static const struct cp *cps[IDX_COUNT] = { &lcp, /* IDX_LCP */ &ipcp, /* IDX_IPCP */ &ipv6cp, /* IDX_IPV6CP */ &pap, /* IDX_PAP */ &chap, /* IDX_CHAP */ }; static void* sppp_alloc(u_char type, struct ifnet *ifp) { struct sppp *sp; sp = malloc(sizeof(struct sppp), M_SPPP, M_WAITOK | M_ZERO); sp->pp_ifp = ifp; return (sp); } static void sppp_free(void *com, u_char type) { free(com, M_SPPP); } static int sppp_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: /* * XXX: should probably be IFT_SPPP, but it's fairly * harmless to allocate struct sppp's for non-sppp * interfaces. */ if_register_com_alloc(IFT_PPP, sppp_alloc, sppp_free); break; case MOD_UNLOAD: /* if_deregister_com_alloc(IFT_PPP); */ return EACCES; default: return EOPNOTSUPP; } return 0; } static moduledata_t spppmod = { "sppp", sppp_modevent, 0 }; MODULE_VERSION(sppp, 1); DECLARE_MODULE(sppp, spppmod, SI_SUB_DRIVERS, SI_ORDER_ANY); /* * Exported functions, comprising our interface to the lower layer. */ /* * Process the received packet. */ void sppp_input(struct ifnet *ifp, struct mbuf *m) { struct ppp_header *h; int isr = -1; struct sppp *sp = IFP2SP(ifp); int debug, do_account = 0; #ifdef INET int hlen, vjlen; u_char *iphdr; #endif SPPP_LOCK(sp); debug = ifp->if_flags & IFF_DEBUG; if (ifp->if_flags & IFF_UP) /* Count received bytes, add FCS and one flag */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + 3); if (m->m_pkthdr.len <= PPP_HEADER_LEN) { /* Too small packet, drop it. */ if (debug) log(LOG_DEBUG, SPP_FMT "input packet is too small, %d bytes\n", SPP_ARGS(ifp), m->m_pkthdr.len); drop: m_freem (m); SPPP_UNLOCK(sp); drop2: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return; } if (sp->pp_mode == PP_FR) { sppp_fr_input (sp, m); SPPP_UNLOCK(sp); return; } /* Get PPP header. */ h = mtod (m, struct ppp_header*); m_adj (m, PPP_HEADER_LEN); switch (h->address) { case PPP_ALLSTATIONS: if (h->control != PPP_UI) goto invalid; if (sp->pp_mode == IFF_CISCO) { if (debug) log(LOG_DEBUG, SPP_FMT "PPP packet in Cisco mode " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } switch (ntohs (h->protocol)) { default: if (debug) log(LOG_DEBUG, SPP_FMT "rejecting protocol " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); if (sp->state[IDX_LCP] == STATE_OPENED) sppp_cp_send (sp, PPP_LCP, PROTO_REJ, ++sp->pp_seq[IDX_LCP], m->m_pkthdr.len + 2, &h->protocol); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto drop; case PPP_LCP: sppp_cp_input(&lcp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_PAP: if (sp->pp_phase >= PHASE_AUTHENTICATE) sppp_pap_input(sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_CHAP: if (sp->pp_phase >= PHASE_AUTHENTICATE) sppp_chap_input(sp, m); m_freem (m); SPPP_UNLOCK(sp); return; #ifdef INET case PPP_IPCP: if (sp->pp_phase == PHASE_NETWORK) sppp_cp_input(&ipcp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_IP: if (sp->state[IDX_IPCP] == STATE_OPENED) { isr = NETISR_IP; } do_account++; break; case PPP_VJ_COMP: if (sp->state[IDX_IPCP] == STATE_OPENED) { if ((vjlen = sl_uncompress_tcp_core(mtod(m, u_char *), m->m_len, m->m_len, TYPE_COMPRESSED_TCP, sp->pp_comp, &iphdr, &hlen)) <= 0) { if (debug) log(LOG_INFO, SPP_FMT "VJ uncompress failed on compressed packet\n", SPP_ARGS(ifp)); goto drop; } /* * Trim the VJ header off the packet, and prepend * the uncompressed IP header (which will usually * end up in two chained mbufs since there's not * enough leading space in the existing mbuf). */ m_adj(m, vjlen); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { SPPP_UNLOCK(sp); goto drop2; } bcopy(iphdr, mtod(m, u_char *), hlen); isr = NETISR_IP; } do_account++; break; case PPP_VJ_UCOMP: if (sp->state[IDX_IPCP] == STATE_OPENED) { if (sl_uncompress_tcp_core(mtod(m, u_char *), m->m_len, m->m_len, TYPE_UNCOMPRESSED_TCP, sp->pp_comp, &iphdr, &hlen) != 0) { if (debug) log(LOG_INFO, SPP_FMT "VJ uncompress failed on uncompressed packet\n", SPP_ARGS(ifp)); goto drop; } isr = NETISR_IP; } do_account++; break; #endif #ifdef INET6 case PPP_IPV6CP: if (sp->pp_phase == PHASE_NETWORK) sppp_cp_input(&ipv6cp, sp, m); m_freem (m); SPPP_UNLOCK(sp); return; case PPP_IPV6: if (sp->state[IDX_IPV6CP] == STATE_OPENED) isr = NETISR_IPV6; do_account++; break; #endif } break; case CISCO_MULTICAST: case CISCO_UNICAST: /* Don't check the control field here (RFC 1547). */ if (sp->pp_mode != IFF_CISCO) { if (debug) log(LOG_DEBUG, SPP_FMT "Cisco packet in PPP mode " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } switch (ntohs (h->protocol)) { default: if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto invalid; case CISCO_KEEPALIVE: sppp_cisco_input (sp, m); m_freem (m); SPPP_UNLOCK(sp); return; #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; do_account++; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; do_account++; break; #endif } break; default: /* Invalid PPP packet. */ invalid: if (debug) log(LOG_DEBUG, SPP_FMT "invalid input packet " "\n", SPP_ARGS(ifp), h->address, h->control, ntohs(h->protocol)); goto drop; } if (! (ifp->if_flags & IFF_UP) || isr == -1) goto drop; SPPP_UNLOCK(sp); M_SETFIB(m, ifp->if_fib); /* Check queue. */ if (netisr_queue(isr, m)) { /* (0) on success. */ if (debug) log(LOG_DEBUG, SPP_FMT "protocol queue overflow\n", SPP_ARGS(ifp)); goto drop2; } if (do_account) /* * Do only account for network packets, not for control * packets. This is used by some subsystems to detect * idle lines. */ sp->pp_last_recv = time_uptime; } static void sppp_ifstart_sched(void *dummy) { struct sppp *sp = dummy; sp->if_start(SP2IFP(sp)); } /* if_start () wrapper function. We use it to schedule real if_start () for * execution. We can't call it directly */ static void sppp_ifstart(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); if (SPPP_LOCK_OWNED(sp)) { if (callout_pending(&sp->ifstart_callout)) return; callout_reset(&sp->ifstart_callout, 1, sppp_ifstart_sched, (void *)sp); } else { sp->if_start(ifp); } } /* * Enqueue transmit packet. */ static int sppp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct sppp *sp = IFP2SP(ifp); struct ppp_header *h; struct ifqueue *ifq = NULL; int error, rv = 0; #ifdef INET int ipproto = PPP_IP; #endif int debug = ifp->if_flags & IFF_DEBUG; + int af = RO_GET_FAMILY(ro, dst); SPPP_LOCK(sp); if (!(ifp->if_flags & IFF_UP) || (!(ifp->if_flags & IFF_AUTO) && !(ifp->if_drv_flags & IFF_DRV_RUNNING))) { #ifdef INET6 drop: #endif m_freem (m); SPPP_UNLOCK(sp); return (ENETDOWN); } if ((ifp->if_flags & IFF_AUTO) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { #ifdef INET6 /* * XXX * * Hack to prevent the initialization-time generated * IPv6 multicast packet to erroneously cause a * dialout event in case IPv6 has been * administratively disabled on that interface. */ - if (dst->sa_family == AF_INET6 && + if (af == AF_INET6 && !(sp->confflags & CONF_ENABLE_IPV6)) goto drop; #endif /* * Interface is not yet running, but auto-dial. Need * to start LCP for it. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } #ifdef INET - if (dst->sa_family == AF_INET) { + if (af == AF_INET) { /* XXX Check mbuf length here? */ struct ip *ip = mtod (m, struct ip*); struct tcphdr *tcp = (struct tcphdr*) ((long*)ip + ip->ip_hl); /* * When using dynamic local IP address assignment by using * 0.0.0.0 as a local address, the first TCP session will * not connect because the local TCP checksum is computed * using 0.0.0.0 which will later become our real IP address * so the TCP checksum computed at the remote end will * become invalid. So we * - don't let packets with src ip addr 0 thru * - we flag TCP packets with src ip 0 as an error */ if(ip->ip_src.s_addr == INADDR_ANY) /* -hm */ { m_freem(m); SPPP_UNLOCK(sp); if(ip->ip_p == IPPROTO_TCP) return(EADDRNOTAVAIL); else return(0); } /* * Put low delay, telnet, rlogin and ftp control packets * in front of the queue or let ALTQ take care. */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) ; else if (_IF_QFULL(&sp->pp_fastq)) ; else if (ip->ip_tos & IPTOS_LOWDELAY) ifq = &sp->pp_fastq; else if (m->m_len < sizeof *ip + sizeof *tcp) ; else if (ip->ip_p != IPPROTO_TCP) ; else if (INTERACTIVE (ntohs (tcp->th_sport))) ifq = &sp->pp_fastq; else if (INTERACTIVE (ntohs (tcp->th_dport))) ifq = &sp->pp_fastq; /* * Do IP Header compression */ if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR && (sp->ipcp.flags & IPCP_VJ) && ip->ip_p == IPPROTO_TCP) switch (sl_compress_tcp(m, ip, sp->pp_comp, sp->ipcp.compress_cid)) { case TYPE_COMPRESSED_TCP: ipproto = PPP_VJ_COMP; break; case TYPE_UNCOMPRESSED_TCP: ipproto = PPP_VJ_UCOMP; break; case TYPE_IP: ipproto = PPP_IP; break; default: m_freem(m); SPPP_UNLOCK(sp); return (EINVAL); } } #endif #ifdef INET6 - if (dst->sa_family == AF_INET6) { + if (af == AF_INET6) { /* XXX do something tricky here? */ } #endif if (sp->pp_mode == PP_FR) { /* Add frame relay header. */ - m = sppp_fr_header (sp, m, dst->sa_family); + m = sppp_fr_header (sp, m, af); if (! m) goto nobufs; goto out; } /* * Prepend general data packet PPP header. For now, IP only. */ M_PREPEND (m, PPP_HEADER_LEN, M_NOWAIT); if (! m) { nobufs: if (debug) log(LOG_DEBUG, SPP_FMT "no memory for transmit header\n", SPP_ARGS(ifp)); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (ENOBUFS); } /* * May want to check size of packet * (albeit due to the implementation it's always enough) */ h = mtod (m, struct ppp_header*); if (sp->pp_mode == IFF_CISCO) { h->address = CISCO_UNICAST; /* unicast address */ h->control = 0; } else { h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ } - switch (dst->sa_family) { + switch (af) { #ifdef INET case AF_INET: /* Internet Protocol */ if (sp->pp_mode == IFF_CISCO) h->protocol = htons (ETHERTYPE_IP); else { /* * Don't choke with an ENETDOWN early. It's * possible that we just started dialing out, * so don't drop the packet immediately. If * we notice that we run out of buffer space * below, we will however remember that we are * not ready to carry IP packets, and return * ENETDOWN, as opposed to ENOBUFS. */ h->protocol = htons(ipproto); if (sp->state[IDX_IPCP] != STATE_OPENED) rv = ENETDOWN; } break; #endif #ifdef INET6 case AF_INET6: /* Internet Protocol */ if (sp->pp_mode == IFF_CISCO) h->protocol = htons (ETHERTYPE_IPV6); else { /* * Don't choke with an ENETDOWN early. It's * possible that we just started dialing out, * so don't drop the packet immediately. If * we notice that we run out of buffer space * below, we will however remember that we are * not ready to carry IP packets, and return * ENETDOWN, as opposed to ENOBUFS. */ h->protocol = htons(PPP_IPV6); if (sp->state[IDX_IPV6CP] != STATE_OPENED) rv = ENETDOWN; } break; #endif default: m_freem (m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (EAFNOSUPPORT); } /* * Queue message on interface, and start output if interface * not yet active. */ out: if (ifq != NULL) error = !(IF_HANDOFF_ADJ(ifq, m, ifp, 3)); else IFQ_HANDOFF_ADJ(ifp, m, 3, error); if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); return (rv? rv: ENOBUFS); } SPPP_UNLOCK(sp); /* * Unlike in sppp_input(), we can always bump the timestamp * here since sppp_output() is only called on behalf of * network-layer traffic; control-layer traffic is handled * by sppp_cp_send(). */ sp->pp_last_sent = time_uptime; return (0); } void sppp_attach(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); /* Initialize mtx lock */ mtx_init(&sp->mtx, "sppp", MTX_NETWORK_LOCK, MTX_DEF | MTX_RECURSE); /* Initialize keepalive handler. */ callout_init(&sp->keepalive_callout, 1); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); ifp->if_mtu = PP_MTU; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_output = sppp_output; #if 0 sp->pp_flags = PP_KEEPALIVE; #endif ifp->if_snd.ifq_maxlen = 32; sp->pp_fastq.ifq_maxlen = 32; sp->pp_cpq.ifq_maxlen = 20; sp->pp_loopcnt = 0; sp->pp_alivecnt = 0; bzero(&sp->pp_seq[0], sizeof(sp->pp_seq)); bzero(&sp->pp_rseq[0], sizeof(sp->pp_rseq)); sp->pp_phase = PHASE_DEAD; sp->pp_up = sppp_pp_up; sp->pp_down = sppp_pp_down; if(!mtx_initialized(&sp->pp_cpq.ifq_mtx)) mtx_init(&sp->pp_cpq.ifq_mtx, "sppp_cpq", NULL, MTX_DEF); if(!mtx_initialized(&sp->pp_fastq.ifq_mtx)) mtx_init(&sp->pp_fastq.ifq_mtx, "sppp_fastq", NULL, MTX_DEF); sp->pp_last_recv = sp->pp_last_sent = time_uptime; sp->confflags = 0; #ifdef INET sp->confflags |= CONF_ENABLE_VJ; #endif #ifdef INET6 sp->confflags |= CONF_ENABLE_IPV6; #endif callout_init(&sp->ifstart_callout, 1); sp->if_start = ifp->if_start; ifp->if_start = sppp_ifstart; sp->pp_comp = malloc(sizeof(struct slcompress), M_TEMP, M_WAITOK); sl_compress_init(sp->pp_comp, -1); sppp_lcp_init(sp); sppp_ipcp_init(sp); sppp_ipv6cp_init(sp); sppp_pap_init(sp); sppp_chap_init(sp); } void sppp_detach(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); int i; KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); /* Stop keepalive handler. */ callout_drain(&sp->keepalive_callout); for (i = 0; i < IDX_COUNT; i++) { callout_drain(&sp->ch[i]); } callout_drain(&sp->pap_my_to_ch); mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); mtx_destroy(&sp->mtx); } /* * Flush the interface output queue. */ static void sppp_flush_unlocked(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); sppp_qflush ((struct ifqueue *)&SP2IFP(sp)->if_snd); sppp_qflush (&sp->pp_fastq); sppp_qflush (&sp->pp_cpq); } void sppp_flush(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); SPPP_LOCK(sp); sppp_flush_unlocked (ifp); SPPP_UNLOCK(sp); } /* * Check if the output queue is empty. */ int sppp_isempty(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); int empty; SPPP_LOCK(sp); empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head && !SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); return (empty); } /* * Get next packet to send. */ struct mbuf * sppp_dequeue(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; SPPP_LOCK(sp); /* * Process only the control protocol queue until we have at * least one NCP open. * * Do always serve all three queues in Cisco mode. */ IF_DEQUEUE(&sp->pp_cpq, m); if (m == NULL && (sppp_ncp_check(sp) || sp->pp_mode == IFF_CISCO || sp->pp_mode == PP_FR)) { IF_DEQUEUE(&sp->pp_fastq, m); if (m == NULL) IF_DEQUEUE (&SP2IFP(sp)->if_snd, m); } SPPP_UNLOCK(sp); return m; } /* * Pick the next packet, do not remove it from the queue. */ struct mbuf * sppp_pick(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; SPPP_LOCK(sp); m = sp->pp_cpq.ifq_head; if (m == NULL && (sp->pp_phase == PHASE_NETWORK || sp->pp_mode == IFF_CISCO || sp->pp_mode == PP_FR)) if ((m = sp->pp_fastq.ifq_head) == NULL) m = SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); return (m); } /* * Process an ioctl request. Called on low priority level. */ int sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data) { struct ifreq *ifr = (struct ifreq*) data; struct sppp *sp = IFP2SP(ifp); int rv, going_up, going_down, newmode; SPPP_LOCK(sp); rv = 0; switch (cmd) { case SIOCAIFADDR: break; case SIOCSIFADDR: /* set the interface "up" when assigning an IP address */ ifp->if_flags |= IFF_UP; /* FALLTHROUGH */ case SIOCSIFFLAGS: going_up = ifp->if_flags & IFF_UP && (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0; going_down = (ifp->if_flags & IFF_UP) == 0 && ifp->if_drv_flags & IFF_DRV_RUNNING; newmode = ifp->if_flags & IFF_PASSIVE; if (!newmode) newmode = ifp->if_flags & IFF_AUTO; if (!newmode) newmode = ifp->if_flags & IFF_CISCO; ifp->if_flags &= ~(IFF_PASSIVE | IFF_AUTO | IFF_CISCO); ifp->if_flags |= newmode; if (!newmode) newmode = sp->pp_flags & PP_FR; if (newmode != sp->pp_mode) { going_down = 1; if (!going_up) going_up = ifp->if_drv_flags & IFF_DRV_RUNNING; } if (going_down) { if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR) lcp.Close(sp); else if (sp->pp_tlf) (sp->pp_tlf)(sp); sppp_flush_unlocked(ifp); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sp->pp_mode = newmode; } if (going_up) { if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR) lcp.Close(sp); sp->pp_mode = newmode; if (sp->pp_mode == 0) { ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } if ((sp->pp_mode == IFF_CISCO) || (sp->pp_mode == PP_FR)) { if (sp->pp_tls) (sp->pp_tls)(sp); ifp->if_drv_flags |= IFF_DRV_RUNNING; } } break; #ifdef SIOCSIFMTU #ifndef ifr_mtu #define ifr_mtu ifr_metric #endif case SIOCSIFMTU: if (ifr->ifr_mtu < 128 || ifr->ifr_mtu > sp->lcp.their_mru) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; break; #endif #ifdef SLIOCSETMTU case SLIOCSETMTU: if (*(short*)data < 128 || *(short*)data > sp->lcp.their_mru) return (EINVAL); ifp->if_mtu = *(short*)data; break; #endif #ifdef SIOCGIFMTU case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; #endif #ifdef SLIOCGETMTU case SLIOCGETMTU: *(short*)data = ifp->if_mtu; break; #endif case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFGENERIC: case SIOCSIFGENERIC: rv = sppp_params(sp, cmd, data); break; default: rv = ENOTTY; } SPPP_UNLOCK(sp); return rv; } /* * Cisco framing implementation. */ /* * Handle incoming Cisco keepalive protocol packets. */ static void sppp_cisco_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct cisco_packet *h; u_long me, mymask; if (m->m_pkthdr.len < CISCO_PACKET_LEN) { if (debug) log(LOG_DEBUG, SPP_FMT "cisco invalid packet length: %d bytes\n", SPP_ARGS(ifp), m->m_pkthdr.len); return; } h = mtod (m, struct cisco_packet*); if (debug) log(LOG_DEBUG, SPP_FMT "cisco input: %d bytes " "<0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", SPP_ARGS(ifp), m->m_pkthdr.len, (u_long)ntohl (h->type), (u_long)h->par1, (u_long)h->par2, (u_int)h->rel, (u_int)h->time0, (u_int)h->time1); switch (ntohl (h->type)) { default: if (debug) log(-1, SPP_FMT "cisco unknown packet type: 0x%lx\n", SPP_ARGS(ifp), (u_long)ntohl (h->type)); break; case CISCO_ADDR_REPLY: /* Reply on address request, ignore */ break; case CISCO_KEEPALIVE_REQ: sp->pp_alivecnt = 0; sp->pp_rseq[IDX_LCP] = ntohl (h->par1); if (sp->pp_seq[IDX_LCP] == sp->pp_rseq[IDX_LCP]) { /* Local and remote sequence numbers are equal. * Probably, the line is in loopback mode. */ if (sp->pp_loopcnt >= MAXALIVECNT) { printf (SPP_FMT "loopback\n", SPP_ARGS(ifp)); sp->pp_loopcnt = 0; if (ifp->if_flags & IFF_UP) { if_down (ifp); sppp_qflush (&sp->pp_cpq); } } ++sp->pp_loopcnt; /* Generate new local sequence number */ sp->pp_seq[IDX_LCP] = random(); break; } sp->pp_loopcnt = 0; if (! (ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { if_up(ifp); printf (SPP_FMT "up\n", SPP_ARGS(ifp)); } break; case CISCO_ADDR_REQ: sppp_get_ip_addrs(sp, &me, 0, &mymask); if (me != 0L) sppp_cisco_send(sp, CISCO_ADDR_REPLY, me, mymask); break; } } /* * Send Cisco keepalive packet. */ static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2) { STDDCL; struct ppp_header *h; struct cisco_packet *ch; struct mbuf *m; struct timeval tv; getmicrouptime(&tv); MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + CISCO_PACKET_LEN; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = CISCO_MULTICAST; h->control = 0; h->protocol = htons (CISCO_KEEPALIVE); ch = (struct cisco_packet*) (h + 1); ch->type = htonl (type); ch->par1 = htonl (par1); ch->par2 = htonl (par2); ch->rel = -1; ch->time0 = htons ((u_short) (tv.tv_sec >> 16)); ch->time1 = htons ((u_short) tv.tv_sec); if (debug) log(LOG_DEBUG, SPP_FMT "cisco output: <0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", SPP_ARGS(ifp), (u_long)ntohl (ch->type), (u_long)ch->par1, (u_long)ch->par2, (u_int)ch->rel, (u_int)ch->time0, (u_int)ch->time1); if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * PPP protocol implementation. */ /* * Send PPP control protocol packet. */ static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type, u_char ident, u_short len, void *data) { STDDCL; struct ppp_header *h; struct lcp_header *lh; struct mbuf *m; if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) len = MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN; MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ h->protocol = htons (proto); /* Link Control Protocol */ lh = (struct lcp_header*) (h + 1); lh->type = type; lh->ident = ident; lh->len = htons (LCP_HEADER_LEN + len); if (len) bcopy (data, lh+1, len); if (debug) { log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_proto_name(proto), sppp_cp_type_name (lh->type), lh->ident, ntohs (lh->len)); sppp_print_bytes ((u_char*) (lh+1), len); log(-1, ">\n"); } if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Handle incoming PPP control protocol packets. */ static void sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len = m->m_pkthdr.len; int rv; u_char *p; if (len < 4) { if (debug) log(LOG_DEBUG, SPP_FMT "%s invalid packet length: %d bytes\n", SPP_ARGS(ifp), cp->name, len); return; } h = mtod (m, struct lcp_header*); if (debug) { log(LOG_DEBUG, SPP_FMT "%s input(%s): <%s id=0x%x len=%d", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx]), sppp_cp_type_name (h->type), h->ident, ntohs (h->len)); sppp_print_bytes ((u_char*) (h+1), len-4); log(-1, ">\n"); } if (len > ntohs (h->len)) len = ntohs (h->len); p = (u_char *)(h + 1); switch (h->type) { case CONF_REQ: if (len < 4) { if (debug) log(-1, SPP_FMT "%s invalid conf-req length %d\n", SPP_ARGS(ifp), cp->name, len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } /* handle states where RCR doesn't get a SCA/SCN */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: case STATE_STOPPING: return; case STATE_CLOSED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); return; } rv = (cp->RCR)(sp, h, len); switch (sp->state[cp->protoidx]) { case STATE_OPENED: (cp->tld)(sp); (cp->scr)(sp); /* FALLTHROUGH */ case STATE_ACK_SENT: case STATE_REQ_SENT: /* * sppp_cp_change_state() have the side effect of * restarting the timeouts. We want to avoid that * if the state don't change, otherwise we won't * ever timeout and resend a configuration request * that got lost. */ if (sp->state[cp->protoidx] == (rv ? STATE_ACK_SENT: STATE_REQ_SENT)) break; sppp_cp_change_state(cp, sp, rv? STATE_ACK_SENT: STATE_REQ_SENT); break; case STATE_STOPPED: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, rv? STATE_ACK_SENT: STATE_REQ_SENT); break; case STATE_ACK_RCVD: if (rv) { sppp_cp_change_state(cp, sp, STATE_OPENED); if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), cp->name); (cp->tlu)(sp); } else sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_ACK: if (h->ident != sp->confid[cp->protoidx]) { if (debug) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_CLOSING: case STATE_STOPPING: break; case STATE_REQ_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_ACK_RCVD: (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; sppp_cp_change_state(cp, sp, STATE_OPENED); if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), cp->name); (cp->tlu)(sp); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_NAK: case CONF_REJ: if (h->ident != sp->confid[cp->protoidx]) { if (debug) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (h->type == CONF_NAK) (cp->RCN_nak)(sp, h, len); else /* CONF_REJ */ (cp->RCN_rej)(sp, h, len); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_REQ_SENT: case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; /* * Slow things down a bit if we think we might be * in loopback. Depend on the timeout to send the * next configuration request. */ if (sp->pp_loopcnt) break; (cp->scr)(sp); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); (cp->scr)(sp); break; case STATE_CLOSING: case STATE_STOPPING: break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case TERM_REQ: switch (sp->state[cp->protoidx]) { case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); /* FALLTHROUGH */ case STATE_CLOSED: case STATE_STOPPED: case STATE_CLOSING: case STATE_STOPPING: case STATE_REQ_SENT: sta: /* Send Terminate-Ack packet. */ if (debug) log(LOG_DEBUG, SPP_FMT "%s send terminate-ack\n", SPP_ARGS(ifp), cp->name); sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); break; case STATE_OPENED: (cp->tld)(sp); sp->rst_counter[cp->protoidx] = 0; sppp_cp_change_state(cp, sp, STATE_STOPPING); goto sta; break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case TERM_ACK: switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: break; case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_CLOSED); (cp->tlf)(sp); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_OPENED: (cp->tld)(sp); (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CODE_REJ: /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ log(LOG_INFO, SPP_FMT "%s: ignoring RXJ (%s) for proto 0x%x, " "danger will robinson\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), ntohs(*((u_short *)p))); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: case STATE_CLOSING: case STATE_STOPPING: case STATE_OPENED: break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case PROTO_REJ: { int catastrophic; const struct cp *upper; int i; u_int16_t proto; catastrophic = 0; upper = NULL; proto = ntohs(*((u_int16_t *)p)); for (i = 0; i < IDX_COUNT; i++) { if (cps[i]->proto == proto) { upper = cps[i]; break; } } if (upper == NULL) catastrophic++; if (catastrophic || debug) log(catastrophic? LOG_INFO: LOG_DEBUG, SPP_FMT "%s: RXJ%c (%s) for proto 0x%x (%s/%s)\n", SPP_ARGS(ifp), cp->name, catastrophic ? '-' : '+', sppp_cp_type_name(h->type), proto, upper ? upper->name : "unknown", upper ? sppp_state_name(sp->state[upper->protoidx]) : "?"); /* * if we got RXJ+ against conf-req, the peer does not implement * this particular protocol type. terminate the protocol. */ if (upper && !catastrophic) { if (sp->state[upper->protoidx] == STATE_REQ_SENT) { upper->Close(sp); break; } } /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_STOPPED: case STATE_REQ_SENT: case STATE_ACK_SENT: case STATE_CLOSING: case STATE_STOPPING: case STATE_OPENED: break; case STATE_ACK_RCVD: sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal %s in state %s\n", SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; } case DISC_REQ: if (cp->proto != PPP_LCP) goto illegal; /* Discard the packet. */ break; case ECHO_REQ: if (cp->proto != PPP_LCP) goto illegal; if (sp->state[cp->protoidx] != STATE_OPENED) { if (debug) log(-1, SPP_FMT "lcp echo req but lcp closed\n", SPP_ARGS(ifp)); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { if (debug) log(-1, SPP_FMT "invalid lcp echo request " "packet length: %d bytes\n", SPP_ARGS(ifp), len); break; } if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && ntohl (*(long*)(h+1)) == sp->lcp.magic) { /* Line loopback mode detected. */ printf(SPP_FMT "loopback\n", SPP_ARGS(ifp)); sp->pp_loopcnt = MAXALIVECNT * 5; if_down (ifp); sppp_qflush (&sp->pp_cpq); /* Shut down the PPP link. */ /* XXX */ lcp.Down(sp); lcp.Up(sp); break; } *(long*)(h+1) = htonl (sp->lcp.magic); if (debug) log(-1, SPP_FMT "got lcp echo req, sending echo rep\n", SPP_ARGS(ifp)); sppp_cp_send (sp, PPP_LCP, ECHO_REPLY, h->ident, len-4, h+1); break; case ECHO_REPLY: if (cp->proto != PPP_LCP) goto illegal; if (h->ident != sp->lcp.echoid) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { if (debug) log(-1, SPP_FMT "lcp invalid echo reply " "packet length: %d bytes\n", SPP_ARGS(ifp), len); break; } if (debug) log(-1, SPP_FMT "lcp got echo rep\n", SPP_ARGS(ifp)); if (!(sp->lcp.opts & (1 << LCP_OPT_MAGIC)) || ntohl (*(long*)(h+1)) != sp->lcp.magic) sp->pp_alivecnt = 0; break; default: /* Unknown packet type -- send Code-Reject packet. */ illegal: if (debug) log(-1, SPP_FMT "%s send code-rej for 0x%x\n", SPP_ARGS(ifp), cp->name, h->type); sppp_cp_send(sp, cp->proto, CODE_REJ, ++sp->pp_seq[cp->protoidx], m->m_pkthdr.len, h); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } } /* * The generic part of all Up/Down/Open/Close/TO event handlers. * Basically, the state transition handling in the automaton. */ static void sppp_up_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s up(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: sppp_cp_change_state(cp, sp, STATE_CLOSED); break; case STATE_STARTING: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; default: printf(SPP_FMT "%s illegal up in state %s\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); } } static void sppp_down_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s down(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_CLOSED: case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_INITIAL); break; case STATE_STOPPED: sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_STARTING); break; case STATE_OPENED: (cp->tld)(sp); sppp_cp_change_state(cp, sp, STATE_STARTING); break; default: printf(SPP_FMT "%s illegal down in state %s\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); } } static void sppp_open_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s open(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STARTING: break; case STATE_CLOSED: sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; (cp->scr)(sp); sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_STOPPED: /* * Try escaping stopped state. This seems to bite * people occasionally, in particular for IPCP, * presumably following previous IPCP negotiation * aborts. Somehow, we must have missed a Down event * which would have caused a transition into starting * state, so as a bandaid we force the Down event now. * This effectively implements (something like the) * `restart' option mentioned in the state transition * table of RFC 1661. */ sppp_cp_change_state(cp, sp, STATE_STARTING); (cp->tls)(sp); break; case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: case STATE_OPENED: break; case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_STOPPING); break; } } static void sppp_close_event(const struct cp *cp, struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "%s close(%s)\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx])); switch (sp->state[cp->protoidx]) { case STATE_INITIAL: case STATE_CLOSED: case STATE_CLOSING: break; case STATE_STARTING: sppp_cp_change_state(cp, sp, STATE_INITIAL); (cp->tlf)(sp); break; case STATE_STOPPED: sppp_cp_change_state(cp, sp, STATE_CLOSED); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_CLOSING); break; case STATE_OPENED: (cp->tld)(sp); /* FALLTHROUGH */ case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sp->rst_counter[cp->protoidx] = sp->lcp.max_terminate; sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); sppp_cp_change_state(cp, sp, STATE_CLOSING); break; } } static void sppp_to_event(const struct cp *cp, struct sppp *sp) { STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "%s TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), cp->name, sppp_state_name(sp->state[cp->protoidx]), sp->rst_counter[cp->protoidx]); if (--sp->rst_counter[cp->protoidx] < 0) /* TO- event */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: sppp_cp_change_state(cp, sp, STATE_CLOSED); (cp->tlf)(sp); break; case STATE_STOPPING: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: sppp_cp_change_state(cp, sp, STATE_STOPPED); (cp->tlf)(sp); break; } else /* TO+ event */ switch (sp->state[cp->protoidx]) { case STATE_CLOSING: case STATE_STOPPING: sppp_cp_send(sp, cp->proto, TERM_REQ, ++sp->pp_seq[cp->protoidx], 0, 0); callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; case STATE_REQ_SENT: case STATE_ACK_RCVD: (cp->scr)(sp); /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(cp, sp, STATE_REQ_SENT); break; case STATE_ACK_SENT: (cp->scr)(sp); callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; } SPPP_UNLOCK(sp); } /* * Change the state of a control protocol in the state automaton. * Takes care of starting/stopping the restart timer. */ static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate) { sp->state[cp->protoidx] = newstate; callout_stop (&sp->ch[cp->protoidx]); switch (newstate) { case STATE_INITIAL: case STATE_STARTING: case STATE_CLOSED: case STATE_STOPPED: case STATE_OPENED: break; case STATE_CLOSING: case STATE_STOPPING: case STATE_REQ_SENT: case STATE_ACK_RCVD: case STATE_ACK_SENT: callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, cp->TO, (void *)sp); break; } } /* *--------------------------------------------------------------------------* * * * The LCP implementation. * * * *--------------------------------------------------------------------------* */ static void sppp_pp_up(struct sppp *sp) { SPPP_LOCK(sp); lcp.Up(sp); SPPP_UNLOCK(sp); } static void sppp_pp_down(struct sppp *sp) { SPPP_LOCK(sp); lcp.Down(sp); SPPP_UNLOCK(sp); } static void sppp_lcp_init(struct sppp *sp) { sp->lcp.opts = (1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; sp->state[IDX_LCP] = STATE_INITIAL; sp->fail_counter[IDX_LCP] = 0; sp->pp_seq[IDX_LCP] = 0; sp->pp_rseq[IDX_LCP] = 0; sp->lcp.protos = 0; sp->lcp.mru = sp->lcp.their_mru = PP_MTU; /* Note that these values are relevant for all control protocols */ sp->lcp.timeout = 3 * hz; sp->lcp.max_terminate = 2; sp->lcp.max_configure = 10; sp->lcp.max_failure = 10; callout_init(&sp->ch[IDX_LCP], 1); } static void sppp_lcp_up(struct sppp *sp) { STDDCL; sp->pp_alivecnt = 0; sp->lcp.opts = (1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; sp->lcp.protos = 0; sp->lcp.mru = sp->lcp.their_mru = PP_MTU; /* * If we are authenticator, negotiate LCP_AUTH */ if (sp->hisauth.proto != 0) sp->lcp.opts |= (1 << LCP_OPT_AUTH_PROTO); else sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); sp->pp_flags &= ~PP_NEEDAUTH; /* * If this interface is passive or dial-on-demand, and we are * still in Initial state, it means we've got an incoming * call. Activate the interface. */ if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) != 0) { if (debug) log(LOG_DEBUG, SPP_FMT "Up event", SPP_ARGS(ifp)); ifp->if_drv_flags |= IFF_DRV_RUNNING; if (sp->state[IDX_LCP] == STATE_INITIAL) { if (debug) log(-1, "(incoming call)\n"); sp->pp_flags |= PP_CALLIN; lcp.Open(sp); } else if (debug) log(-1, "\n"); } else if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0 && (sp->state[IDX_LCP] == STATE_INITIAL)) { ifp->if_drv_flags |= IFF_DRV_RUNNING; lcp.Open(sp); } sppp_up_event(&lcp, sp); } static void sppp_lcp_down(struct sppp *sp) { STDDCL; sppp_down_event(&lcp, sp); /* * If this is neither a dial-on-demand nor a passive * interface, simulate an ``ifconfig down'' action, so the * administrator can force a redial by another ``ifconfig * up''. XXX For leased line operation, should we immediately * try to reopen the connection here? */ if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0) { log(LOG_INFO, SPP_FMT "Down event, taking interface down.\n", SPP_ARGS(ifp)); if_down(ifp); } else { if (debug) log(LOG_DEBUG, SPP_FMT "Down event (carrier loss)\n", SPP_ARGS(ifp)); sp->pp_flags &= ~PP_CALLIN; if (sp->state[IDX_LCP] != STATE_INITIAL) lcp.Close(sp); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } } static void sppp_lcp_open(struct sppp *sp) { sppp_open_event(&lcp, sp); } static void sppp_lcp_close(struct sppp *sp) { sppp_close_event(&lcp, sp); } static void sppp_lcp_TO(void *cookie) { sppp_to_event(&lcp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) */ static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *r, *p; int origlen, rlen; u_long nmagic; u_short authproto; len -= 4; origlen = len; buf = r = malloc (len, M_TEMP, M_NOWAIT); if (! buf) return (0); if (debug) log(LOG_DEBUG, SPP_FMT "lcp parse opts: ", SPP_ARGS(ifp)); /* pass 1: check for things that need to be rejected */ p = (void*) (h+1); for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number. */ if (len >= 6 && p[1] == 6) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_ASYNC_MAP: /* Async control character map. */ if (len >= 6 && p[1] == 6) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_MRU: /* Maximum receive unit. */ if (len >= 4 && p[1] == 4) continue; if (debug) log(-1, "[invalid] "); break; case LCP_OPT_AUTH_PROTO: if (len < 4) { if (debug) log(-1, "[invalid] "); break; } authproto = (p[2] << 8) + p[3]; if (authproto == PPP_CHAP && p[1] != 5) { if (debug) log(-1, "[invalid chap len] "); break; } if (sp->myauth.proto == 0) { /* we are not configured to do auth */ if (debug) log(-1, "[not configured] "); break; } /* * Remote want us to authenticate, remember this, * so we stay in PHASE_AUTHENTICATE after LCP got * up. */ sp->pp_flags |= PP_NEEDAUTH; continue; default: /* Others not supported. */ if (debug) log(-1, "[rej] "); break; } /* Add the option to rejected list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); return 0; } else if (debug) log(-1, "\n"); /* * pass 2: check for option values that are unacceptable and * thus require to be nak'ed. */ if (debug) log(LOG_DEBUG, SPP_FMT "lcp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- extract. */ nmagic = (u_long)p[2] << 24 | (u_long)p[3] << 16 | p[4] << 8 | p[5]; if (nmagic != sp->lcp.magic) { sp->pp_loopcnt = 0; if (debug) log(-1, "0x%lx ", nmagic); continue; } if (debug && sp->pp_loopcnt < MAXALIVECNT*5) log(-1, "[glitch] "); ++sp->pp_loopcnt; /* * We negate our magic here, and NAK it. If * we see it later in an NAK packet, we * suggest a new one. */ nmagic = ~sp->lcp.magic; /* Gonna NAK it. */ p[2] = nmagic >> 24; p[3] = nmagic >> 16; p[4] = nmagic >> 8; p[5] = nmagic; break; case LCP_OPT_ASYNC_MAP: /* * Async control character map -- just ignore it. * * Quote from RFC 1662, chapter 6: * To enable this functionality, synchronous PPP * implementations MUST always respond to the * Async-Control-Character-Map Configuration * Option with the LCP Configure-Ack. However, * acceptance of the Configuration Option does * not imply that the synchronous implementation * will do any ACCM mapping. Instead, all such * octet mapping will be performed by the * asynchronous-to-synchronous converter. */ continue; case LCP_OPT_MRU: /* * Maximum receive unit. Always agreeable, * but ignored by now. */ sp->lcp.their_mru = p[2] * 256 + p[3]; if (debug) log(-1, "%lu ", sp->lcp.their_mru); continue; case LCP_OPT_AUTH_PROTO: authproto = (p[2] << 8) + p[3]; if (sp->myauth.proto != authproto) { /* not agreed, nak */ if (debug) log(-1, "[mine %s != his %s] ", sppp_proto_name(sp->hisauth.proto), sppp_proto_name(authproto)); p[2] = sp->myauth.proto >> 8; p[3] = sp->myauth.proto; break; } if (authproto == PPP_CHAP && p[4] != CHAP_MD5) { if (debug) log(-1, "[chap not MD5] "); p[4] = CHAP_MD5; break; } continue; } /* Add the option to nak'ed list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { /* * Local and remote magics equal -- loopback? */ if (sp->pp_loopcnt >= MAXALIVECNT*5) { if (sp->pp_loopcnt == MAXALIVECNT*5) printf (SPP_FMT "loopback\n", SPP_ARGS(ifp)); if (ifp->if_flags & IFF_UP) { if_down(ifp); sppp_qflush(&sp->pp_cpq); /* XXX ? */ lcp.Down(sp); lcp.Up(sp); } } else if (!sp->pp_loopcnt && ++sp->fail_counter[IDX_LCP] >= sp->lcp.max_failure) { if (debug) log(-1, " max_failure (%d) exceeded, " "send conf-rej\n", sp->lcp.max_failure); sppp_cp_send(sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); } else { if (debug) log(-1, " send conf-nak\n"); sppp_cp_send (sp, PPP_LCP, CONF_NAK, h->ident, rlen, buf); } } else { if (debug) log(-1, " send conf-ack\n"); sp->fail_counter[IDX_LCP] = 0; sp->pp_loopcnt = 0; sppp_cp_send (sp, PPP_LCP, CONF_ACK, h->ident, origlen, h+1); } free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the LCP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *p; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "lcp rej opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- can't use it, use 0 */ sp->lcp.opts &= ~(1 << LCP_OPT_MAGIC); sp->lcp.magic = 0; break; case LCP_OPT_MRU: /* * Should not be rejected anyway, since we only * negotiate a MRU if explicitly requested by * peer. */ sp->lcp.opts &= ~(1 << LCP_OPT_MRU); break; case LCP_OPT_AUTH_PROTO: /* * Peer doesn't want to authenticate himself, * deny unless this is a dialout call, and * AUTHFLAG_NOCALLOUT is set. */ if ((sp->pp_flags & PP_CALLIN) == 0 && (sp->hisauth.flags & AUTHFLAG_NOCALLOUT) != 0) { if (debug) log(-1, "[don't insist on auth " "for callout]"); sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); break; } if (debug) log(-1, "[access denied]\n"); lcp.Close(sp); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the LCP Configure-NAK option list, and adjust our * negotiation. */ static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { STDDCL; u_char *buf, *p; u_long magic; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "lcp nak opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_lcp_opt_name(*p)); switch (*p) { case LCP_OPT_MAGIC: /* Magic number -- renegotiate */ if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && len >= 6 && p[1] == 6) { magic = (u_long)p[2] << 24 | (u_long)p[3] << 16 | p[4] << 8 | p[5]; /* * If the remote magic is our negated one, * this looks like a loopback problem. * Suggest a new magic to make sure. */ if (magic == ~sp->lcp.magic) { if (debug) log(-1, "magic glitch "); sp->lcp.magic = random(); } else { sp->lcp.magic = magic; if (debug) log(-1, "%lu ", magic); } } break; case LCP_OPT_MRU: /* * Peer wants to advise us to negotiate an MRU. * Agree on it if it's reasonable, or use * default otherwise. */ if (len >= 4 && p[1] == 4) { u_int mru = p[2] * 256 + p[3]; if (debug) log(-1, "%d ", mru); if (mru < PP_MTU || mru > PP_MAX_MRU) mru = PP_MTU; sp->lcp.mru = mru; sp->lcp.opts |= (1 << LCP_OPT_MRU); } break; case LCP_OPT_AUTH_PROTO: /* * Peer doesn't like our authentication method, * deny. */ if (debug) log(-1, "[access denied]\n"); lcp.Close(sp); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_lcp_tlu(struct sppp *sp) { STDDCL; int i; u_long mask; /* XXX ? */ if (! (ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* Coming out of loopback mode. */ if_up(ifp); printf (SPP_FMT "up\n", SPP_ARGS(ifp)); } for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_QUAL) (cps[i])->Open(sp); if ((sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0 || (sp->pp_flags & PP_NEEDAUTH) != 0) sp->pp_phase = PHASE_AUTHENTICATE; else sp->pp_phase = PHASE_NETWORK; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* * Open all authentication protocols. This is even required * if we already proceeded to network phase, since it might be * that remote wants us to authenticate, so we might have to * send a PAP request. Undesired authentication protocols * don't do anything when they get an Open event. */ for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_AUTH) (cps[i])->Open(sp); if (sp->pp_phase == PHASE_NETWORK) { /* Notify all NCPs. */ for (i = 0; i < IDX_COUNT; i++) if (((cps[i])->flags & CP_NCP) && /* * XXX * Hack to administratively disable IPv6 if * not desired. Perhaps we should have another * flag for this, but right now, we can make * all struct cp's read/only. */ (cps[i] != &ipv6cp || (sp->confflags & CONF_ENABLE_IPV6))) (cps[i])->Open(sp); } /* Send Up events to all started protos. */ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) (cps[i])->Up(sp); /* notify low-level driver of state change */ if (sp->pp_chg) sp->pp_chg(sp, (int)sp->pp_phase); if (sp->pp_phase == PHASE_NETWORK) /* if no NCP is starting, close down */ sppp_lcp_check_and_close(sp); } static void sppp_lcp_tld(struct sppp *sp) { STDDCL; int i; u_long mask; sp->pp_phase = PHASE_TERMINATE; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* * Take upper layers down. We send the Down event first and * the Close second to prevent the upper layers from sending * ``a flurry of terminate-request packets'', as the RFC * describes it. */ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) { (cps[i])->Down(sp); (cps[i])->Close(sp); } } static void sppp_lcp_tls(struct sppp *sp) { STDDCL; sp->pp_phase = PHASE_ESTABLISH; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify lower layer if desired. */ if (sp->pp_tls) (sp->pp_tls)(sp); else (sp->pp_up)(sp); } static void sppp_lcp_tlf(struct sppp *sp) { STDDCL; sp->pp_phase = PHASE_DEAD; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify lower layer if desired. */ if (sp->pp_tlf) (sp->pp_tlf)(sp); else (sp->pp_down)(sp); } static void sppp_lcp_scr(struct sppp *sp) { char opt[6 /* magicnum */ + 4 /* mru */ + 5 /* chap */]; int i = 0; u_short authproto; if (sp->lcp.opts & (1 << LCP_OPT_MAGIC)) { if (! sp->lcp.magic) sp->lcp.magic = random(); opt[i++] = LCP_OPT_MAGIC; opt[i++] = 6; opt[i++] = sp->lcp.magic >> 24; opt[i++] = sp->lcp.magic >> 16; opt[i++] = sp->lcp.magic >> 8; opt[i++] = sp->lcp.magic; } if (sp->lcp.opts & (1 << LCP_OPT_MRU)) { opt[i++] = LCP_OPT_MRU; opt[i++] = 4; opt[i++] = sp->lcp.mru >> 8; opt[i++] = sp->lcp.mru; } if (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) { authproto = sp->hisauth.proto; opt[i++] = LCP_OPT_AUTH_PROTO; opt[i++] = authproto == PPP_CHAP? 5: 4; opt[i++] = authproto >> 8; opt[i++] = authproto; if (authproto == PPP_CHAP) opt[i++] = CHAP_MD5; } sp->confid[IDX_LCP] = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, CONF_REQ, sp->confid[IDX_LCP], i, &opt); } /* * Check the open NCPs, return true if at least one NCP is open. */ static int sppp_ncp_check(struct sppp *sp) { int i, mask; for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && (cps[i])->flags & CP_NCP) return 1; return 0; } /* * Re-check the open NCPs and see if we should terminate the link. * Called by the NCPs during their tlf action handling. */ static void sppp_lcp_check_and_close(struct sppp *sp) { if (sp->pp_phase < PHASE_NETWORK) /* don't bother, we are already going down */ return; if (sppp_ncp_check(sp)) return; lcp.Close(sp); } /* *--------------------------------------------------------------------------* * * * The IPCP implementation. * * * *--------------------------------------------------------------------------* */ #ifdef INET static void sppp_ipcp_init(struct sppp *sp) { sp->ipcp.opts = 0; sp->ipcp.flags = 0; sp->state[IDX_IPCP] = STATE_INITIAL; sp->fail_counter[IDX_IPCP] = 0; sp->pp_seq[IDX_IPCP] = 0; sp->pp_rseq[IDX_IPCP] = 0; callout_init(&sp->ch[IDX_IPCP], 1); } static void sppp_ipcp_up(struct sppp *sp) { sppp_up_event(&ipcp, sp); } static void sppp_ipcp_down(struct sppp *sp) { sppp_down_event(&ipcp, sp); } static void sppp_ipcp_open(struct sppp *sp) { STDDCL; u_long myaddr, hisaddr; sp->ipcp.flags &= ~(IPCP_HISADDR_SEEN | IPCP_MYADDR_SEEN | IPCP_MYADDR_DYN | IPCP_VJ); sp->ipcp.opts = 0; sppp_get_ip_addrs(sp, &myaddr, &hisaddr, 0); /* * If we don't have his address, this probably means our * interface doesn't want to talk IP at all. (This could * be the case if somebody wants to speak only IPX, for * example.) Don't open IPCP in this case. */ if (hisaddr == 0L) { /* XXX this message should go away */ if (debug) log(LOG_DEBUG, SPP_FMT "ipcp_open(): no IP interface\n", SPP_ARGS(ifp)); return; } if (myaddr == 0L) { /* * I don't have an assigned address, so i need to * negotiate my address. */ sp->ipcp.flags |= IPCP_MYADDR_DYN; sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); } else sp->ipcp.flags |= IPCP_MYADDR_SEEN; if (sp->confflags & CONF_ENABLE_VJ) { sp->ipcp.opts |= (1 << IPCP_OPT_COMPRESSION); sp->ipcp.max_state = MAX_STATES - 1; sp->ipcp.compress_cid = 1; } sppp_open_event(&ipcp, sp); } static void sppp_ipcp_close(struct sppp *sp) { sppp_close_event(&ipcp, sp); if (sp->ipcp.flags & IPCP_MYADDR_DYN) /* * My address was dynamic, clear it again. */ sppp_set_ip_addr(sp, 0L); } static void sppp_ipcp_TO(void *cookie) { sppp_to_event(&ipcp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) */ static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *r, *p; struct ifnet *ifp = SP2IFP(sp); int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; u_long hisaddr, desiredaddr; int gotmyaddr = 0; int desiredcomp; len -= 4; origlen = len; /* * Make sure to allocate a buf that can at least hold a * conf-nak with an `address' option. We might need it below. */ buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); if (! buf) return (0); /* pass 1: see if we can recognize them */ if (debug) log(LOG_DEBUG, SPP_FMT "ipcp parse opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: if (!(sp->confflags & CONF_ENABLE_VJ)) { /* VJ compression administratively disabled */ if (debug) log(-1, "[locally disabled] "); break; } /* * In theory, we should only conf-rej an * option that is shorter than RFC 1618 * requires (i.e. < 4), and should conf-nak * anything else that is not VJ. However, * since our algorithm always uses the * original option to NAK it with new values, * things would become more complicated. In * practice, the only commonly implemented IP * compression option is VJ anyway, so the * difference is negligible. */ if (len >= 6 && p[1] == 6) { /* * correctly formed compression option * that could be VJ compression */ continue; } if (debug) log(-1, "optlen %d [invalid/unsupported] ", p[1]); break; case IPCP_OPT_ADDRESS: if (len >= 6 && p[1] == 6) { /* correctly formed address option */ continue; } if (debug) log(-1, "[invalid] "); break; default: /* Others not supported. */ if (debug) log(-1, "[rej] "); break; } /* Add the option to rejected list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_IPCP, CONF_REJ, h->ident, rlen, buf); return 0; } else if (debug) log(-1, "\n"); /* pass 2: parse option values */ sppp_get_ip_addrs(sp, 0, &hisaddr, 0); if (debug) log(LOG_DEBUG, SPP_FMT "ipcp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: desiredcomp = p[2] << 8 | p[3]; /* We only support VJ */ if (desiredcomp == IPCP_COMP_VJ) { if (debug) log(-1, "VJ [ack] "); sp->ipcp.flags |= IPCP_VJ; sl_compress_init(sp->pp_comp, p[4]); sp->ipcp.max_state = p[4]; sp->ipcp.compress_cid = p[5]; continue; } if (debug) log(-1, "compproto %#04x [not supported] ", desiredcomp); p[2] = IPCP_COMP_VJ >> 8; p[3] = IPCP_COMP_VJ; p[4] = sp->ipcp.max_state; p[5] = sp->ipcp.compress_cid; break; case IPCP_OPT_ADDRESS: /* This is the address he wants in his end */ desiredaddr = p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; if (desiredaddr == hisaddr || (hisaddr >= 1 && hisaddr <= 254 && desiredaddr != 0)) { /* * Peer's address is same as our value, * or we have set it to 0.0.0.* to * indicate that we do not really care, * this is agreeable. Gonna conf-ack * it. */ if (debug) log(-1, "%s [ack] ", sppp_dotted_quad(hisaddr)); /* record that we've seen it already */ sp->ipcp.flags |= IPCP_HISADDR_SEEN; continue; } /* * The address wasn't agreeable. This is either * he sent us 0.0.0.0, asking to assign him an * address, or he send us another address not * matching our value. Either case, we gonna * conf-nak it with our value. * XXX: we should "rej" if hisaddr == 0 */ if (debug) { if (desiredaddr == 0) log(-1, "[addr requested] "); else log(-1, "%s [not agreed] ", sppp_dotted_quad(desiredaddr)); } p[2] = hisaddr >> 24; p[3] = hisaddr >> 16; p[4] = hisaddr >> 8; p[5] = hisaddr; break; } /* Add the option to nak'ed list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } /* * If we are about to conf-ack the request, but haven't seen * his address so far, gonna conf-nak it instead, with the * `address' option present and our idea of his address being * filled in there, to request negotiation of both addresses. * * XXX This can result in an endless req - nak loop if peer * doesn't want to send us his address. Q: What should we do * about it? XXX A: implement the max-failure counter. */ if (rlen == 0 && !(sp->ipcp.flags & IPCP_HISADDR_SEEN) && !gotmyaddr) { buf[0] = IPCP_OPT_ADDRESS; buf[1] = 6; buf[2] = hisaddr >> 24; buf[3] = hisaddr >> 16; buf[4] = hisaddr >> 8; buf[5] = hisaddr; rlen = 6; if (debug) log(-1, "still need hisaddr "); } if (rlen) { if (debug) log(-1, " send conf-nak\n"); sppp_cp_send (sp, PPP_IPCP, CONF_NAK, h->ident, rlen, buf); } else { if (debug) log(-1, " send conf-ack\n"); sppp_cp_send (sp, PPP_IPCP, CONF_ACK, h->ident, origlen, h+1); } free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the IPCP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipcp rej opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION); break; case IPCP_OPT_ADDRESS: /* * Peer doesn't grok address option. This is * bad. XXX Should we better give up here? * XXX We could try old "addresses" option... */ sp->ipcp.opts &= ~(1 << IPCP_OPT_ADDRESS); break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the IPCP Configure-NAK option list, and adjust our * negotiation. */ static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; int desiredcomp; u_long wantaddr; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipcp nak opts: ", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s ", sppp_ipcp_opt_name(*p)); switch (*p) { case IPCP_OPT_COMPRESSION: if (len >= 6 && p[1] == 6) { desiredcomp = p[2] << 8 | p[3]; if (debug) log(-1, "[wantcomp %#04x] ", desiredcomp); if (desiredcomp == IPCP_COMP_VJ) { sl_compress_init(sp->pp_comp, p[4]); sp->ipcp.max_state = p[4]; sp->ipcp.compress_cid = p[5]; if (debug) log(-1, "[agree] "); } else sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION); } break; case IPCP_OPT_ADDRESS: /* * Peer doesn't like our local IP address. See * if we can do something for him. We'll drop * him our address then. */ if (len >= 6 && p[1] == 6) { wantaddr = p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); if (debug) log(-1, "[wantaddr %s] ", sppp_dotted_quad(wantaddr)); /* * When doing dynamic address assignment, * we accept his offer. Otherwise, we * ignore it and thus continue to negotiate * our already existing value. * XXX: Bogus, if he said no once, he'll * just say no again, might as well die. */ if (sp->ipcp.flags & IPCP_MYADDR_DYN) { sppp_set_ip_addr(sp, wantaddr); if (debug) log(-1, "[agree] "); sp->ipcp.flags |= IPCP_MYADDR_SEEN; } } break; } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_ipcp_tlu(struct sppp *sp) { /* we are up - notify isdn daemon */ if (sp->pp_con) sp->pp_con(sp); } static void sppp_ipcp_tld(struct sppp *sp) { } static void sppp_ipcp_tls(struct sppp *sp) { /* indicate to LCP that it must stay alive */ sp->lcp.protos |= (1 << IDX_IPCP); } static void sppp_ipcp_tlf(struct sppp *sp) { /* we no longer need LCP */ sp->lcp.protos &= ~(1 << IDX_IPCP); sppp_lcp_check_and_close(sp); } static void sppp_ipcp_scr(struct sppp *sp) { char opt[6 /* compression */ + 6 /* address */]; u_long ouraddr; int i = 0; if (sp->ipcp.opts & (1 << IPCP_OPT_COMPRESSION)) { opt[i++] = IPCP_OPT_COMPRESSION; opt[i++] = 6; opt[i++] = IPCP_COMP_VJ >> 8; opt[i++] = IPCP_COMP_VJ; opt[i++] = sp->ipcp.max_state; opt[i++] = sp->ipcp.compress_cid; } if (sp->ipcp.opts & (1 << IPCP_OPT_ADDRESS)) { sppp_get_ip_addrs(sp, &ouraddr, 0, 0); opt[i++] = IPCP_OPT_ADDRESS; opt[i++] = 6; opt[i++] = ouraddr >> 24; opt[i++] = ouraddr >> 16; opt[i++] = ouraddr >> 8; opt[i++] = ouraddr; } sp->confid[IDX_IPCP] = ++sp->pp_seq[IDX_IPCP]; sppp_cp_send(sp, PPP_IPCP, CONF_REQ, sp->confid[IDX_IPCP], i, &opt); } #else /* !INET */ static void sppp_ipcp_init(struct sppp *sp) { } static void sppp_ipcp_up(struct sppp *sp) { } static void sppp_ipcp_down(struct sppp *sp) { } static void sppp_ipcp_open(struct sppp *sp) { } static void sppp_ipcp_close(struct sppp *sp) { } static void sppp_ipcp_TO(void *cookie) { } static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) { return (0); } static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipcp_tlu(struct sppp *sp) { } static void sppp_ipcp_tld(struct sppp *sp) { } static void sppp_ipcp_tls(struct sppp *sp) { } static void sppp_ipcp_tlf(struct sppp *sp) { } static void sppp_ipcp_scr(struct sppp *sp) { } #endif /* *--------------------------------------------------------------------------* * * * The IPv6CP implementation. * * * *--------------------------------------------------------------------------* */ #ifdef INET6 static void sppp_ipv6cp_init(struct sppp *sp) { sp->ipv6cp.opts = 0; sp->ipv6cp.flags = 0; sp->state[IDX_IPV6CP] = STATE_INITIAL; sp->fail_counter[IDX_IPV6CP] = 0; sp->pp_seq[IDX_IPV6CP] = 0; sp->pp_rseq[IDX_IPV6CP] = 0; callout_init(&sp->ch[IDX_IPV6CP], 1); } static void sppp_ipv6cp_up(struct sppp *sp) { sppp_up_event(&ipv6cp, sp); } static void sppp_ipv6cp_down(struct sppp *sp) { sppp_down_event(&ipv6cp, sp); } static void sppp_ipv6cp_open(struct sppp *sp) { STDDCL; struct in6_addr myaddr, hisaddr; #ifdef IPV6CP_MYIFID_DYN sp->ipv6cp.flags &= ~(IPV6CP_MYIFID_SEEN|IPV6CP_MYIFID_DYN); #else sp->ipv6cp.flags &= ~IPV6CP_MYIFID_SEEN; #endif sppp_get_ip6_addrs(sp, &myaddr, &hisaddr, 0); /* * If we don't have our address, this probably means our * interface doesn't want to talk IPv6 at all. (This could * be the case if somebody wants to speak only IPX, for * example.) Don't open IPv6CP in this case. */ if (IN6_IS_ADDR_UNSPECIFIED(&myaddr)) { /* XXX this message should go away */ if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp_open(): no IPv6 interface\n", SPP_ARGS(ifp)); return; } sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); sppp_open_event(&ipv6cp, sp); } static void sppp_ipv6cp_close(struct sppp *sp) { sppp_close_event(&ipv6cp, sp); } static void sppp_ipv6cp_TO(void *cookie) { sppp_to_event(&ipv6cp, (struct sppp *)cookie); } /* * Analyze a configure request. Return true if it was agreeable, and * caused action sca, false if it has been rejected or nak'ed, and * caused action scn. (The return value is used to make the state * transition decision in the state automaton.) */ static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *r, *p; struct ifnet *ifp = SP2IFP(sp); int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; struct in6_addr myaddr, desiredaddr, suggestaddr; int ifidcount; int type; int collision, nohisaddr; char ip6buf[INET6_ADDRSTRLEN]; len -= 4; origlen = len; /* * Make sure to allocate a buf that can at least hold a * conf-nak with an `address' option. We might need it below. */ buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); if (! buf) return (0); /* pass 1: see if we can recognize them */ if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp parse opts:", SPP_ARGS(ifp)); p = (void*) (h+1); ifidcount = 0; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: if (len >= 10 && p[1] == 10 && ifidcount == 0) { /* correctly formed address option */ ifidcount++; continue; } if (debug) log(-1, " [invalid]"); break; #ifdef notyet case IPV6CP_OPT_COMPRESSION: if (len >= 4 && p[1] >= 4) { /* correctly formed compress option */ continue; } if (debug) log(-1, " [invalid]"); break; #endif default: /* Others not supported. */ if (debug) log(-1, " [rej]"); break; } /* Add the option to rejected list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen) { if (debug) log(-1, " send conf-rej\n"); sppp_cp_send (sp, PPP_IPV6CP, CONF_REJ, h->ident, rlen, buf); goto end; } else if (debug) log(-1, "\n"); /* pass 2: parse option values */ sppp_get_ip6_addrs(sp, &myaddr, 0, 0); if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp parse opt values: ", SPP_ARGS(ifp)); p = (void*) (h+1); len = origlen; type = CONF_ACK; for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; len-=p[1], p+=p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { #ifdef notyet case IPV6CP_OPT_COMPRESSION: continue; #endif case IPV6CP_OPT_IFID: bzero(&desiredaddr, sizeof(desiredaddr)); bcopy(&p[2], &desiredaddr.s6_addr[8], 8); collision = (bcmp(&desiredaddr.s6_addr[8], &myaddr.s6_addr[8], 8) == 0); nohisaddr = IN6_IS_ADDR_UNSPECIFIED(&desiredaddr); desiredaddr.s6_addr16[0] = htons(0xfe80); (void)in6_setscope(&desiredaddr, SP2IFP(sp), NULL); if (!collision && !nohisaddr) { /* no collision, hisaddr known - Conf-Ack */ type = CONF_ACK; if (debug) { log(-1, " %s [%s]", ip6_sprintf(ip6buf, &desiredaddr), sppp_cp_type_name(type)); } continue; } bzero(&suggestaddr, sizeof(suggestaddr)); if (collision && nohisaddr) { /* collision, hisaddr unknown - Conf-Rej */ type = CONF_REJ; bzero(&p[2], 8); } else { /* * - no collision, hisaddr unknown, or * - collision, hisaddr known * Conf-Nak, suggest hisaddr */ type = CONF_NAK; sppp_suggest_ip6_addr(sp, &suggestaddr); bcopy(&suggestaddr.s6_addr[8], &p[2], 8); } if (debug) log(-1, " %s [%s]", ip6_sprintf(ip6buf, &desiredaddr), sppp_cp_type_name(type)); break; } /* Add the option to nak'ed list. */ bcopy (p, r, p[1]); r += p[1]; rlen += p[1]; } if (rlen == 0 && type == CONF_ACK) { if (debug) log(-1, " send %s\n", sppp_cp_type_name(type)); sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, origlen, h+1); } else { #ifdef DIAGNOSTIC if (type == CONF_ACK) panic("IPv6CP RCR: CONF_ACK with non-zero rlen"); #endif if (debug) { log(-1, " send %s suggest %s\n", sppp_cp_type_name(type), ip6_sprintf(ip6buf, &suggestaddr)); } sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, rlen, buf); } end: free (buf, M_TEMP); return (rlen == 0); } /* * Analyze the IPv6CP Configure-Reject option list, and adjust our * negotiation. */ static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp rej opts:", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: /* * Peer doesn't grok address option. This is * bad. XXX Should we better give up here? */ sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_IFID); break; #ifdef notyet case IPV6CP_OPT_COMPRESS: sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_COMPRESS); break; #endif } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } /* * Analyze the IPv6CP Configure-NAK option list, and adjust our * negotiation. */ static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { u_char *buf, *p; struct ifnet *ifp = SP2IFP(sp); int debug = ifp->if_flags & IFF_DEBUG; struct in6_addr suggestaddr; char ip6buf[INET6_ADDRSTRLEN]; len -= 4; buf = malloc (len, M_TEMP, M_NOWAIT); if (!buf) return; if (debug) log(LOG_DEBUG, SPP_FMT "ipv6cp nak opts:", SPP_ARGS(ifp)); p = (void*) (h+1); for (; len >= 2 && p[1] >= 2 && len >= p[1]; len -= p[1], p += p[1]) { if (debug) log(-1, " %s", sppp_ipv6cp_opt_name(*p)); switch (*p) { case IPV6CP_OPT_IFID: /* * Peer doesn't like our local ifid. See * if we can do something for him. We'll drop * him our address then. */ if (len < 10 || p[1] != 10) break; bzero(&suggestaddr, sizeof(suggestaddr)); suggestaddr.s6_addr16[0] = htons(0xfe80); (void)in6_setscope(&suggestaddr, SP2IFP(sp), NULL); bcopy(&p[2], &suggestaddr.s6_addr[8], 8); sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); if (debug) log(-1, " [suggestaddr %s]", ip6_sprintf(ip6buf, &suggestaddr)); #ifdef IPV6CP_MYIFID_DYN /* * When doing dynamic address assignment, * we accept his offer. */ if (sp->ipv6cp.flags & IPV6CP_MYIFID_DYN) { struct in6_addr lastsuggest; /* * If equals to * , * we have a collision. generate new random * ifid. */ sppp_suggest_ip6_addr(&lastsuggest); if (IN6_ARE_ADDR_EQUAL(&suggestaddr, lastsuggest)) { if (debug) log(-1, " [random]"); sppp_gen_ip6_addr(sp, &suggestaddr); } sppp_set_ip6_addr(sp, &suggestaddr, 0); if (debug) log(-1, " [agree]"); sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; } #else /* * Since we do not do dynamic address assignment, * we ignore it and thus continue to negotiate * our already existing value. This can possibly * go into infinite request-reject loop. * * This is not likely because we normally use * ifid based on MAC-address. * If you have no ethernet card on the node, too bad. * XXX should we use fail_counter? */ #endif break; #ifdef notyet case IPV6CP_OPT_COMPRESS: /* * Peer wants different compression parameters. */ break; #endif } } if (debug) log(-1, "\n"); free (buf, M_TEMP); return; } static void sppp_ipv6cp_tlu(struct sppp *sp) { /* we are up - notify isdn daemon */ if (sp->pp_con) sp->pp_con(sp); } static void sppp_ipv6cp_tld(struct sppp *sp) { } static void sppp_ipv6cp_tls(struct sppp *sp) { /* indicate to LCP that it must stay alive */ sp->lcp.protos |= (1 << IDX_IPV6CP); } static void sppp_ipv6cp_tlf(struct sppp *sp) { #if 0 /* need #if 0 to close IPv6CP properly */ /* we no longer need LCP */ sp->lcp.protos &= ~(1 << IDX_IPV6CP); sppp_lcp_check_and_close(sp); #endif } static void sppp_ipv6cp_scr(struct sppp *sp) { char opt[10 /* ifid */ + 4 /* compression, minimum */]; struct in6_addr ouraddr; int i = 0; if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_IFID)) { sppp_get_ip6_addrs(sp, &ouraddr, 0, 0); opt[i++] = IPV6CP_OPT_IFID; opt[i++] = 10; bcopy(&ouraddr.s6_addr[8], &opt[i], 8); i += 8; } #ifdef notyet if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_COMPRESSION)) { opt[i++] = IPV6CP_OPT_COMPRESSION; opt[i++] = 4; opt[i++] = 0; /* TBD */ opt[i++] = 0; /* TBD */ /* variable length data may follow */ } #endif sp->confid[IDX_IPV6CP] = ++sp->pp_seq[IDX_IPV6CP]; sppp_cp_send(sp, PPP_IPV6CP, CONF_REQ, sp->confid[IDX_IPV6CP], i, &opt); } #else /*INET6*/ static void sppp_ipv6cp_init(struct sppp *sp) { } static void sppp_ipv6cp_up(struct sppp *sp) { } static void sppp_ipv6cp_down(struct sppp *sp) { } static void sppp_ipv6cp_open(struct sppp *sp) { } static void sppp_ipv6cp_close(struct sppp *sp) { } static void sppp_ipv6cp_TO(void *sp) { } static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) { return 0; } static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) { } static void sppp_ipv6cp_tlu(struct sppp *sp) { } static void sppp_ipv6cp_tld(struct sppp *sp) { } static void sppp_ipv6cp_tls(struct sppp *sp) { } static void sppp_ipv6cp_tlf(struct sppp *sp) { } static void sppp_ipv6cp_scr(struct sppp *sp) { } #endif /*INET6*/ /* *--------------------------------------------------------------------------* * * * The CHAP implementation. * * * *--------------------------------------------------------------------------* */ /* * The authentication protocols don't employ a full-fledged state machine as * the control protocols do, since they do have Open and Close events, but * not Up and Down, nor are they explicitly terminated. Also, use of the * authentication protocols may be different in both directions (this makes * sense, think of a machine that never accepts incoming calls but only * calls out, it doesn't require the called party to authenticate itself). * * Our state machine for the local authentication protocol (we are requesting * the peer to authenticate) looks like: * * RCA- * +--------------------------------------------+ * V scn,tld| * +--------+ Close +---------+ RCA+ * | |<----------------------------------| |------+ * +--->| Closed | TO* | Opened | sca | * | | |-----+ +-------| |<-----+ * | +--------+ irc | | +---------+ * | ^ | | ^ * | | | | | * | | | | | * | TO-| | | | * | |tld TO+ V | | * | | +------->+ | | * | | | | | | * | +--------+ V | | * | | |<----+<--------------------+ | * | | Req- | scr | * | | Sent | | * | | | | * | +--------+ | * | RCA- | | RCA+ | * +------+ +------------------------------------------+ * scn,tld sca,irc,ict,tlu * * * with: * * Open: LCP reached authentication phase * Close: LCP reached terminate phase * * RCA+: received reply (pap-req, chap-response), acceptable * RCN: received reply (pap-req, chap-response), not acceptable * TO+: timeout with restart counter >= 0 * TO-: timeout with restart counter < 0 * TO*: reschedule timeout for CHAP * * scr: send request packet (none for PAP, chap-challenge) * sca: send ack packet (pap-ack, chap-success) * scn: send nak packet (pap-nak, chap-failure) * ict: initialize re-challenge timer (CHAP only) * * tlu: this-layer-up, LCP reaches network phase * tld: this-layer-down, LCP enters terminate phase * * Note that in CHAP mode, after sending a new challenge, while the state * automaton falls back into Req-Sent state, it doesn't signal a tld * event to LCP, so LCP remains in network phase. Only after not getting * any response (or after getting an unacceptable response), CHAP closes, * causing LCP to enter terminate phase. * * With PAP, there is no initial request that can be sent. The peer is * expected to send one based on the successful negotiation of PAP as * the authentication protocol during the LCP option negotiation. * * Incoming authentication protocol requests (remote requests * authentication, we are peer) don't employ a state machine at all, * they are simply answered. Some peers [Ascend P50 firmware rev * 4.50] react allergically when sending IPCP requests while they are * still in authentication phase (thereby violating the standard that * demands that these NCP packets are to be discarded), so we keep * track of the peer demanding us to authenticate, and only proceed to * phase network once we've seen a positive acknowledge for the * authentication. */ /* * Handle incoming CHAP packets. */ static void sppp_chap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len; u_char *value, *name, digest[AUTHKEYLEN], dsize; int value_len, name_len; MD5_CTX ctx; len = m->m_pkthdr.len; if (len < 4) { if (debug) log(LOG_DEBUG, SPP_FMT "chap invalid packet length: %d bytes\n", SPP_ARGS(ifp), len); return; } h = mtod (m, struct lcp_header*); if (len > ntohs (h->len)) len = ntohs (h->len); switch (h->type) { /* challenge, failure and success are his authproto */ case CHAP_CHALLENGE: value = 1 + (u_char*)(h+1); value_len = value[-1]; name = value + value_len; name_len = len - value_len - 5; if (name_len < 0) { if (debug) { log(LOG_DEBUG, SPP_FMT "chap corrupted challenge " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*) (h+1), len-4); log(-1, ">\n"); } break; } if (debug) { log(LOG_DEBUG, SPP_FMT "chap input <%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_string((char*) name, name_len); log(-1, " value-size=%d value=", value_len); sppp_print_bytes(value, value_len); log(-1, ">\n"); } /* Compute reply value. */ MD5Init(&ctx); MD5Update(&ctx, &h->ident, 1); MD5Update(&ctx, sp->myauth.secret, sppp_strnlen(sp->myauth.secret, AUTHKEYLEN)); MD5Update(&ctx, value, value_len); MD5Final(digest, &ctx); dsize = sizeof digest; sppp_auth_send(&chap, sp, CHAP_RESPONSE, h->ident, sizeof dsize, (const char *)&dsize, sizeof digest, digest, (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN), sp->myauth.name, 0); break; case CHAP_SUCCESS: if (debug) { log(LOG_DEBUG, SPP_FMT "chap success", SPP_ARGS(ifp)); if (len > 4) { log(-1, ": "); sppp_print_string((char*)(h + 1), len - 4); } log(-1, "\n"); } SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_CHAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) && (sp->lcp.protos & (1 << IDX_CHAP)) == 0) { /* * We are authenticator for CHAP but didn't * complete yet. Leave it to tlu to proceed * to network phase. */ SPPP_UNLOCK(sp); break; } SPPP_UNLOCK(sp); sppp_phase_network(sp); break; case CHAP_FAILURE: if (debug) { log(LOG_INFO, SPP_FMT "chap failure", SPP_ARGS(ifp)); if (len > 4) { log(-1, ": "); sppp_print_string((char*)(h + 1), len - 4); } log(-1, "\n"); } else log(LOG_INFO, SPP_FMT "chap failure\n", SPP_ARGS(ifp)); /* await LCP shutdown by authenticator */ break; /* response is my authproto */ case CHAP_RESPONSE: value = 1 + (u_char*)(h+1); value_len = value[-1]; name = value + value_len; name_len = len - value_len - 5; if (name_len < 0) { if (debug) { log(LOG_DEBUG, SPP_FMT "chap corrupted response " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } if (h->ident != sp->confid[IDX_CHAP]) { if (debug) log(LOG_DEBUG, SPP_FMT "chap dropping response for old ID " "(got %d, expected %d)\n", SPP_ARGS(ifp), h->ident, sp->confid[IDX_CHAP]); break; } if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) || bcmp(name, sp->hisauth.name, name_len) != 0) { log(LOG_INFO, SPP_FMT "chap response, his name ", SPP_ARGS(ifp)); sppp_print_string(name, name_len); log(-1, " != expected "); sppp_print_string(sp->hisauth.name, sppp_strnlen(sp->hisauth.name, AUTHNAMELEN)); log(-1, "\n"); } if (debug) { log(LOG_DEBUG, SPP_FMT "chap input(%s) " "<%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), sppp_auth_type_name(PPP_CHAP, h->type), h->ident, ntohs (h->len)); sppp_print_string((char*)name, name_len); log(-1, " value-size=%d value=", value_len); sppp_print_bytes(value, value_len); log(-1, ">\n"); } if (value_len != AUTHKEYLEN) { if (debug) log(LOG_DEBUG, SPP_FMT "chap bad hash value length: " "%d bytes, should be %d\n", SPP_ARGS(ifp), value_len, AUTHKEYLEN); break; } MD5Init(&ctx); MD5Update(&ctx, &h->ident, 1); MD5Update(&ctx, sp->hisauth.secret, sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN)); MD5Update(&ctx, sp->myauth.challenge, AUTHKEYLEN); MD5Final(digest, &ctx); #define FAILMSG "Failed..." #define SUCCMSG "Welcome!" if (value_len != sizeof digest || bcmp(digest, value, value_len) != 0) { /* action scn, tld */ sppp_auth_send(&chap, sp, CHAP_FAILURE, h->ident, sizeof(FAILMSG) - 1, (u_char *)FAILMSG, 0); chap.tld(sp); break; } /* action sca, perhaps tlu */ if (sp->state[IDX_CHAP] == STATE_REQ_SENT || sp->state[IDX_CHAP] == STATE_OPENED) sppp_auth_send(&chap, sp, CHAP_SUCCESS, h->ident, sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG, 0); if (sp->state[IDX_CHAP] == STATE_REQ_SENT) { sppp_cp_change_state(&chap, sp, STATE_OPENED); chap.tlu(sp); } break; default: /* Unknown CHAP packet type -- ignore. */ if (debug) { log(LOG_DEBUG, SPP_FMT "chap unknown input(%s) " "<0x%x id=0x%xh len=%d", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), h->type, h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } } static void sppp_chap_init(struct sppp *sp) { /* Chap doesn't have STATE_INITIAL at all. */ sp->state[IDX_CHAP] = STATE_CLOSED; sp->fail_counter[IDX_CHAP] = 0; sp->pp_seq[IDX_CHAP] = 0; sp->pp_rseq[IDX_CHAP] = 0; callout_init(&sp->ch[IDX_CHAP], 1); } static void sppp_chap_open(struct sppp *sp) { if (sp->myauth.proto == PPP_CHAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) { /* we are authenticator for CHAP, start it */ chap.scr(sp); sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; sppp_cp_change_state(&chap, sp, STATE_REQ_SENT); } /* nothing to be done if we are peer, await a challenge */ } static void sppp_chap_close(struct sppp *sp) { if (sp->state[IDX_CHAP] != STATE_CLOSED) sppp_cp_change_state(&chap, sp, STATE_CLOSED); } static void sppp_chap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "chap TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_CHAP]), sp->rst_counter[IDX_CHAP]); if (--sp->rst_counter[IDX_CHAP] < 0) /* TO- event */ switch (sp->state[IDX_CHAP]) { case STATE_REQ_SENT: chap.tld(sp); sppp_cp_change_state(&chap, sp, STATE_CLOSED); break; } else /* TO+ (or TO*) event */ switch (sp->state[IDX_CHAP]) { case STATE_OPENED: /* TO* event */ sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; /* FALLTHROUGH */ case STATE_REQ_SENT: chap.scr(sp); /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(&chap, sp, STATE_REQ_SENT); break; } SPPP_UNLOCK(sp); } static void sppp_chap_tlu(struct sppp *sp) { STDDCL; int i; i = 0; sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; /* * Some broken CHAP implementations (Conware CoNet, firmware * 4.0.?) don't want to re-authenticate their CHAP once the * initial challenge-response exchange has taken place. * Provide for an option to avoid rechallenges. */ if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) { /* * Compute the re-challenge timeout. This will yield * a number between 300 and 810 seconds. */ i = 300 + ((unsigned)(random() & 0xff00) >> 7); callout_reset(&sp->ch[IDX_CHAP], i * hz, chap.TO, (void *)sp); } if (debug) { log(LOG_DEBUG, SPP_FMT "chap %s, ", SPP_ARGS(ifp), sp->pp_phase == PHASE_NETWORK? "reconfirmed": "tlu"); if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) log(-1, "next re-challenge in %d seconds\n", i); else log(-1, "re-challenging suppressed\n"); } SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_CHAP); if (sp->pp_flags & PP_NEEDAUTH) { /* * Remote is authenticator, but his auth proto didn't * complete yet. Defer the transition to network * phase. */ SPPP_UNLOCK(sp); return; } SPPP_UNLOCK(sp); /* * If we are already in phase network, we are done here. This * is the case if this is a dummy tlu event after a re-challenge. */ if (sp->pp_phase != PHASE_NETWORK) sppp_phase_network(sp); } static void sppp_chap_tld(struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp)); callout_stop(&sp->ch[IDX_CHAP]); sp->lcp.protos &= ~(1 << IDX_CHAP); lcp.Close(sp); } static void sppp_chap_scr(struct sppp *sp) { u_long *ch; u_char clen; /* Compute random challenge. */ ch = (u_long *)sp->myauth.challenge; arc4random_buf(ch, 4 * sizeof(*ch)); clen = AUTHKEYLEN; sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP]; sppp_auth_send(&chap, sp, CHAP_CHALLENGE, sp->confid[IDX_CHAP], sizeof clen, (const char *)&clen, (size_t)AUTHKEYLEN, sp->myauth.challenge, (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN), sp->myauth.name, 0); } /* *--------------------------------------------------------------------------* * * * The PAP implementation. * * * *--------------------------------------------------------------------------* */ /* * For PAP, we need to keep a little state also if we are the peer, not the * authenticator. This is since we don't get a request to authenticate, but * have to repeatedly authenticate ourself until we got a response (or the * retry counter is expired). */ /* * Handle incoming PAP packets. */ static void sppp_pap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; int len; u_char *name, *passwd, mlen; int name_len, passwd_len; len = m->m_pkthdr.len; if (len < 5) { if (debug) log(LOG_DEBUG, SPP_FMT "pap invalid packet length: %d bytes\n", SPP_ARGS(ifp), len); return; } h = mtod (m, struct lcp_header*); if (len > ntohs (h->len)) len = ntohs (h->len); switch (h->type) { /* PAP request is my authproto */ case PAP_REQ: name = 1 + (u_char*)(h+1); name_len = name[-1]; passwd = name + name_len + 1; if (name_len > len - 6 || (passwd_len = passwd[-1]) > len - 6 - name_len) { if (debug) { log(LOG_DEBUG, SPP_FMT "pap corrupted input " "<%s id=0x%x len=%d", SPP_ARGS(ifp), sppp_auth_type_name(PPP_PAP, h->type), h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } if (debug) { log(LOG_DEBUG, SPP_FMT "pap input(%s) " "<%s id=0x%x len=%d name=", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_PAP]), sppp_auth_type_name(PPP_PAP, h->type), h->ident, ntohs(h->len)); sppp_print_string((char*)name, name_len); log(-1, " passwd="); sppp_print_string((char*)passwd, passwd_len); log(-1, ">\n"); } if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) || passwd_len != sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN) || bcmp(name, sp->hisauth.name, name_len) != 0 || bcmp(passwd, sp->hisauth.secret, passwd_len) != 0) { /* action scn, tld */ mlen = sizeof(FAILMSG) - 1; sppp_auth_send(&pap, sp, PAP_NAK, h->ident, sizeof mlen, (const char *)&mlen, sizeof(FAILMSG) - 1, (u_char *)FAILMSG, 0); pap.tld(sp); break; } /* action sca, perhaps tlu */ if (sp->state[IDX_PAP] == STATE_REQ_SENT || sp->state[IDX_PAP] == STATE_OPENED) { mlen = sizeof(SUCCMSG) - 1; sppp_auth_send(&pap, sp, PAP_ACK, h->ident, sizeof mlen, (const char *)&mlen, sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG, 0); } if (sp->state[IDX_PAP] == STATE_REQ_SENT) { sppp_cp_change_state(&pap, sp, STATE_OPENED); pap.tlu(sp); } break; /* ack and nak are his authproto */ case PAP_ACK: callout_stop(&sp->pap_my_to_ch); if (debug) { log(LOG_DEBUG, SPP_FMT "pap success", SPP_ARGS(ifp)); name_len = *((char *)h); if (len > 5 && name_len) { log(-1, ": "); sppp_print_string((char*)(h+1), name_len); } log(-1, "\n"); } SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_PAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) && (sp->lcp.protos & (1 << IDX_PAP)) == 0) { /* * We are authenticator for PAP but didn't * complete yet. Leave it to tlu to proceed * to network phase. */ SPPP_UNLOCK(sp); break; } SPPP_UNLOCK(sp); sppp_phase_network(sp); break; case PAP_NAK: callout_stop (&sp->pap_my_to_ch); if (debug) { log(LOG_INFO, SPP_FMT "pap failure", SPP_ARGS(ifp)); name_len = *((char *)h); if (len > 5 && name_len) { log(-1, ": "); sppp_print_string((char*)(h+1), name_len); } log(-1, "\n"); } else log(LOG_INFO, SPP_FMT "pap failure\n", SPP_ARGS(ifp)); /* await LCP shutdown by authenticator */ break; default: /* Unknown PAP packet type -- ignore. */ if (debug) { log(LOG_DEBUG, SPP_FMT "pap corrupted input " "<0x%x id=0x%x len=%d", SPP_ARGS(ifp), h->type, h->ident, ntohs(h->len)); sppp_print_bytes((u_char*)(h+1), len-4); log(-1, ">\n"); } break; } } static void sppp_pap_init(struct sppp *sp) { /* PAP doesn't have STATE_INITIAL at all. */ sp->state[IDX_PAP] = STATE_CLOSED; sp->fail_counter[IDX_PAP] = 0; sp->pp_seq[IDX_PAP] = 0; sp->pp_rseq[IDX_PAP] = 0; callout_init(&sp->ch[IDX_PAP], 1); callout_init(&sp->pap_my_to_ch, 1); } static void sppp_pap_open(struct sppp *sp) { if (sp->hisauth.proto == PPP_PAP && (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) { /* we are authenticator for PAP, start our timer */ sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); } if (sp->myauth.proto == PPP_PAP) { /* we are peer, send a request, and start a timer */ pap.scr(sp); callout_reset(&sp->pap_my_to_ch, sp->lcp.timeout, sppp_pap_my_TO, (void *)sp); } } static void sppp_pap_close(struct sppp *sp) { if (sp->state[IDX_PAP] != STATE_CLOSED) sppp_cp_change_state(&pap, sp, STATE_CLOSED); } /* * That's the timeout routine if we are authenticator. Since the * authenticator is basically passive in PAP, we can't do much here. */ static void sppp_pap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "pap TO(%s) rst_counter = %d\n", SPP_ARGS(ifp), sppp_state_name(sp->state[IDX_PAP]), sp->rst_counter[IDX_PAP]); if (--sp->rst_counter[IDX_PAP] < 0) /* TO- event */ switch (sp->state[IDX_PAP]) { case STATE_REQ_SENT: pap.tld(sp); sppp_cp_change_state(&pap, sp, STATE_CLOSED); break; } else /* TO+ event, not very much we could do */ switch (sp->state[IDX_PAP]) { case STATE_REQ_SENT: /* sppp_cp_change_state() will restart the timer */ sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); break; } SPPP_UNLOCK(sp); } /* * That's the timeout handler if we are peer. Since the peer is active, * we need to retransmit our PAP request since it is apparently lost. * XXX We should impose a max counter. */ static void sppp_pap_my_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "pap peer TO\n", SPP_ARGS(ifp)); SPPP_LOCK(sp); pap.scr(sp); SPPP_UNLOCK(sp); } static void sppp_pap_tlu(struct sppp *sp) { STDDCL; sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; if (debug) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), pap.name); SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_PAP); if (sp->pp_flags & PP_NEEDAUTH) { /* * Remote is authenticator, but his auth proto didn't * complete yet. Defer the transition to network * phase. */ SPPP_UNLOCK(sp); return; } SPPP_UNLOCK(sp); sppp_phase_network(sp); } static void sppp_pap_tld(struct sppp *sp) { STDDCL; if (debug) log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp)); callout_stop (&sp->ch[IDX_PAP]); callout_stop (&sp->pap_my_to_ch); sp->lcp.protos &= ~(1 << IDX_PAP); lcp.Close(sp); } static void sppp_pap_scr(struct sppp *sp) { u_char idlen, pwdlen; sp->confid[IDX_PAP] = ++sp->pp_seq[IDX_PAP]; pwdlen = sppp_strnlen(sp->myauth.secret, AUTHKEYLEN); idlen = sppp_strnlen(sp->myauth.name, AUTHNAMELEN); sppp_auth_send(&pap, sp, PAP_REQ, sp->confid[IDX_PAP], sizeof idlen, (const char *)&idlen, (size_t)idlen, sp->myauth.name, sizeof pwdlen, (const char *)&pwdlen, (size_t)pwdlen, sp->myauth.secret, 0); } /* * Random miscellaneous functions. */ /* * Send a PAP or CHAP proto packet. * * Varadic function, each of the elements for the ellipsis is of type * ``size_t mlen, const u_char *msg''. Processing will stop iff * mlen == 0. * NOTE: never declare variadic functions with types subject to type * promotion (i.e. u_char). This is asking for big trouble depending * on the architecture you are on... */ static void sppp_auth_send(const struct cp *cp, struct sppp *sp, unsigned int type, unsigned int id, ...) { STDDCL; struct ppp_header *h; struct lcp_header *lh; struct mbuf *m; u_char *p; int len; unsigned int mlen; const char *msg; va_list ap; MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.rcvif = 0; h = mtod (m, struct ppp_header*); h->address = PPP_ALLSTATIONS; /* broadcast address */ h->control = PPP_UI; /* Unnumbered Info */ h->protocol = htons(cp->proto); lh = (struct lcp_header*)(h + 1); lh->type = type; lh->ident = id; p = (u_char*) (lh+1); va_start(ap, id); len = 0; while ((mlen = (unsigned int)va_arg(ap, size_t)) != 0) { msg = va_arg(ap, const char *); len += mlen; if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) { va_end(ap); m_freem(m); return; } bcopy(msg, p, mlen); p += mlen; } va_end(ap); m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; lh->len = htons (LCP_HEADER_LEN + len); if (debug) { log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d", SPP_ARGS(ifp), cp->name, sppp_auth_type_name(cp->proto, lh->type), lh->ident, ntohs(lh->len)); sppp_print_bytes((u_char*) (lh+1), len); log(-1, ">\n"); } if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* * Flush interface queue. */ static void sppp_qflush(struct ifqueue *ifq) { struct mbuf *m, *n; n = ifq->ifq_head; while ((m = n)) { n = m->m_nextpkt; m_freem (m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; } /* * Send keepalive packets, every 10 seconds. */ static void sppp_keepalive(void *dummy) { struct sppp *sp = (struct sppp*)dummy; struct ifnet *ifp = SP2IFP(sp); SPPP_LOCK(sp); /* Keepalive mode disabled or channel down? */ if (! (sp->pp_flags & PP_KEEPALIVE) || ! (ifp->if_drv_flags & IFF_DRV_RUNNING)) goto out; if (sp->pp_mode == PP_FR) { sppp_fr_keepalive (sp); goto out; } /* No keepalive in PPP mode if LCP not opened yet. */ if (sp->pp_mode != IFF_CISCO && sp->pp_phase < PHASE_AUTHENTICATE) goto out; if (sp->pp_alivecnt == MAXALIVECNT) { /* No keepalive packets got. Stop the interface. */ printf (SPP_FMT "down\n", SPP_ARGS(ifp)); if_down (ifp); sppp_qflush (&sp->pp_cpq); if (sp->pp_mode != IFF_CISCO) { /* XXX */ /* Shut down the PPP link. */ lcp.Down(sp); /* Initiate negotiation. XXX */ lcp.Up(sp); } } if (sp->pp_alivecnt <= MAXALIVECNT) ++sp->pp_alivecnt; if (sp->pp_mode == IFF_CISCO) sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ, ++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]); else if (sp->pp_phase >= PHASE_AUTHENTICATE) { uint32_t nmagic = htonl(sp->lcp.magic); sp->lcp.echoid = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, ECHO_REQ, sp->lcp.echoid, 4, &nmagic); } out: SPPP_UNLOCK(sp); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); } /* * Get both IP addresses. */ void sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask) { struct epoch_tracker et; struct ifnet *ifp = SP2IFP(sp); struct ifaddr *ifa; struct sockaddr_in *si, *sm; u_long ssrc, ddst; sm = NULL; ssrc = ddst = 0L; /* * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ si = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { si = (struct sockaddr_in *)ifa->ifa_addr; sm = (struct sockaddr_in *)ifa->ifa_netmask; if (si) break; } if (ifa) { if (si && si->sin_addr.s_addr) { ssrc = si->sin_addr.s_addr; if (srcmask) *srcmask = ntohl(sm->sin_addr.s_addr); } si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) ddst = si->sin_addr.s_addr; } NET_EPOCH_EXIT(et); if (dst) *dst = ntohl(ddst); if (src) *src = ntohl(ssrc); } #ifdef INET /* * Set my IP address. */ static void sppp_set_ip_addr(struct sppp *sp, u_long src) { STDDCL; struct epoch_tracker et; struct ifaddr *ifa; struct sockaddr_in *si; struct in_ifaddr *ia; /* * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ si = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { si = (struct sockaddr_in *)ifa->ifa_addr; if (si != NULL) { ifa_ref(ifa); break; } } } NET_EPOCH_EXIT(et); if (ifa != NULL) { int error; int fibnum = ifp->if_fib; rt_addrmsg(RTM_DELETE, ifa, fibnum); /* delete old route */ ia = ifatoia(ifa); error = in_handle_ifaddr_route(RTM_DELETE, ia); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit DEL failed, error=%d\n", SPP_ARGS(ifp), error); } /* set new address */ si->sin_addr.s_addr = htonl(src); IN_IFADDR_WLOCK(); LIST_REMOVE(ia, ia_hash); LIST_INSERT_HEAD(INADDR_HASH(si->sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); rt_addrmsg(RTM_ADD, ifa, fibnum); /* add new route */ error = in_handle_ifaddr_route(RTM_ADD, ia); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit ADD failed, error=%d", SPP_ARGS(ifp), error); } ifa_free(ifa); } } #endif #ifdef INET6 /* * Get both IPv6 addresses. */ static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst, struct in6_addr *srcmask) { struct epoch_tracker et; struct ifnet *ifp = SP2IFP(sp); struct ifaddr *ifa; struct sockaddr_in6 *si, *sm; struct in6_addr ssrc, ddst; sm = NULL; bzero(&ssrc, sizeof(ssrc)); bzero(&ddst, sizeof(ddst)); /* * Pick the first link-local AF_INET6 address from the list, * aliases don't make any sense on a p2p link anyway. */ si = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET6) { si = (struct sockaddr_in6 *)ifa->ifa_addr; sm = (struct sockaddr_in6 *)ifa->ifa_netmask; if (si && IN6_IS_ADDR_LINKLOCAL(&si->sin6_addr)) break; } if (ifa) { if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) { bcopy(&si->sin6_addr, &ssrc, sizeof(ssrc)); if (srcmask) { bcopy(&sm->sin6_addr, srcmask, sizeof(*srcmask)); } } si = (struct sockaddr_in6 *)ifa->ifa_dstaddr; if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) bcopy(&si->sin6_addr, &ddst, sizeof(ddst)); } if (dst) bcopy(&ddst, dst, sizeof(*dst)); if (src) bcopy(&ssrc, src, sizeof(*src)); NET_EPOCH_EXIT(et); } #ifdef IPV6CP_MYIFID_DYN /* * Generate random ifid. */ static void sppp_gen_ip6_addr(struct sppp *sp, struct in6_addr *addr) { /* TBD */ } /* * Set my IPv6 address. */ static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src) { STDDCL; struct epoch_tracker et; struct ifaddr *ifa; struct sockaddr_in6 *sin6; /* * Pick the first link-local AF_INET6 address from the list, * aliases don't make any sense on a p2p link anyway. */ sin6 = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (sin6 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { ifa_ref(ifa); break; } } } NET_EPOCH_EXIT(et); if (ifa != NULL) { int error; struct sockaddr_in6 new_sin6 = *sin6; bcopy(src, &new_sin6.sin6_addr, sizeof(new_sin6.sin6_addr)); error = in6_ifinit(ifp, ifatoia6(ifa), &new_sin6, 1); if (debug && error) { log(LOG_DEBUG, SPP_FMT "sppp_set_ip6_addr: in6_ifinit " " failed, error=%d\n", SPP_ARGS(ifp), error); } ifa_free(ifa); } } #endif /* * Suggest a candidate address to be used by peer. */ static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *suggest) { struct in6_addr myaddr; struct timeval tv; sppp_get_ip6_addrs(sp, &myaddr, 0, 0); myaddr.s6_addr[8] &= ~0x02; /* u bit to "local" */ microtime(&tv); if ((tv.tv_usec & 0xff) == 0 && (tv.tv_sec & 0xff) == 0) { myaddr.s6_addr[14] ^= 0xff; myaddr.s6_addr[15] ^= 0xff; } else { myaddr.s6_addr[14] ^= (tv.tv_usec & 0xff); myaddr.s6_addr[15] ^= (tv.tv_sec & 0xff); } if (suggest) bcopy(&myaddr, suggest, sizeof(myaddr)); } #endif /*INET6*/ static int sppp_params(struct sppp *sp, u_long cmd, void *data) { u_long subcmd; struct ifreq *ifr = (struct ifreq *)data; struct spppreq *spr; int rv = 0; if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == NULL) return (EAGAIN); /* * ifr_data_get_ptr(ifr) is supposed to point to a struct spppreq. * Check the cmd word first before attempting to fetch all the * data. */ rv = fueword(ifr_data_get_ptr(ifr), &subcmd); if (rv == -1) { rv = EFAULT; goto quit; } if (copyin(ifr_data_get_ptr(ifr), spr, sizeof(struct spppreq)) != 0) { rv = EFAULT; goto quit; } switch (subcmd) { case (u_long)SPPPIOGDEFS: if (cmd != SIOCGIFGENERIC) { rv = EINVAL; break; } /* * We copy over the entire current state, but clean * out some of the stuff we don't wanna pass up. * Remember, SIOCGIFGENERIC is unprotected, and can be * called by any user. No need to ever get PAP or * CHAP secrets back to userland anyway. */ spr->defs.pp_phase = sp->pp_phase; spr->defs.enable_vj = (sp->confflags & CONF_ENABLE_VJ) != 0; spr->defs.enable_ipv6 = (sp->confflags & CONF_ENABLE_IPV6) != 0; spr->defs.lcp = sp->lcp; spr->defs.ipcp = sp->ipcp; spr->defs.ipv6cp = sp->ipv6cp; spr->defs.myauth = sp->myauth; spr->defs.hisauth = sp->hisauth; bzero(spr->defs.myauth.secret, AUTHKEYLEN); bzero(spr->defs.myauth.challenge, AUTHKEYLEN); bzero(spr->defs.hisauth.secret, AUTHKEYLEN); bzero(spr->defs.hisauth.challenge, AUTHKEYLEN); /* * Fixup the LCP timeout value to milliseconds so * spppcontrol doesn't need to bother about the value * of "hz". We do the reverse calculation below when * setting it. */ spr->defs.lcp.timeout = sp->lcp.timeout * 1000 / hz; rv = copyout(spr, ifr_data_get_ptr(ifr), sizeof(struct spppreq)); break; case (u_long)SPPPIOSDEFS: if (cmd != SIOCSIFGENERIC) { rv = EINVAL; break; } /* * We have a very specific idea of which fields we * allow being passed back from userland, so to not * clobber our current state. For one, we only allow * setting anything if LCP is in dead or establish * phase. Once the authentication negotiations * started, the authentication settings must not be * changed again. (The administrator can force an * ifconfig down in order to get LCP back into dead * phase.) * * Also, we only allow for authentication parameters to be * specified. * * XXX Should allow to set or clear pp_flags. * * Finally, if the respective authentication protocol to * be used is set differently than 0, but the secret is * passed as all zeros, we don't trash the existing secret. * This allows an administrator to change the system name * only without clobbering the secret (which he didn't get * back in a previous SPPPIOGDEFS call). However, the * secrets are cleared if the authentication protocol is * reset to 0. */ if (sp->pp_phase != PHASE_DEAD && sp->pp_phase != PHASE_ESTABLISH) { rv = EBUSY; break; } if ((spr->defs.myauth.proto != 0 && spr->defs.myauth.proto != PPP_PAP && spr->defs.myauth.proto != PPP_CHAP) || (spr->defs.hisauth.proto != 0 && spr->defs.hisauth.proto != PPP_PAP && spr->defs.hisauth.proto != PPP_CHAP)) { rv = EINVAL; break; } if (spr->defs.myauth.proto == 0) /* resetting myauth */ bzero(&sp->myauth, sizeof sp->myauth); else { /* setting/changing myauth */ sp->myauth.proto = spr->defs.myauth.proto; bcopy(spr->defs.myauth.name, sp->myauth.name, AUTHNAMELEN); if (spr->defs.myauth.secret[0] != '\0') bcopy(spr->defs.myauth.secret, sp->myauth.secret, AUTHKEYLEN); } if (spr->defs.hisauth.proto == 0) /* resetting hisauth */ bzero(&sp->hisauth, sizeof sp->hisauth); else { /* setting/changing hisauth */ sp->hisauth.proto = spr->defs.hisauth.proto; sp->hisauth.flags = spr->defs.hisauth.flags; bcopy(spr->defs.hisauth.name, sp->hisauth.name, AUTHNAMELEN); if (spr->defs.hisauth.secret[0] != '\0') bcopy(spr->defs.hisauth.secret, sp->hisauth.secret, AUTHKEYLEN); } /* set LCP restart timer timeout */ if (spr->defs.lcp.timeout != 0) sp->lcp.timeout = spr->defs.lcp.timeout * hz / 1000; /* set VJ enable and IPv6 disable flags */ #ifdef INET if (spr->defs.enable_vj) sp->confflags |= CONF_ENABLE_VJ; else sp->confflags &= ~CONF_ENABLE_VJ; #endif #ifdef INET6 if (spr->defs.enable_ipv6) sp->confflags |= CONF_ENABLE_IPV6; else sp->confflags &= ~CONF_ENABLE_IPV6; #endif break; default: rv = EINVAL; } quit: free(spr, M_TEMP); return (rv); } static void sppp_phase_network(struct sppp *sp) { STDDCL; int i; u_long mask; sp->pp_phase = PHASE_NETWORK; if (debug) log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), sppp_phase_name(sp->pp_phase)); /* Notify NCPs now. */ for (i = 0; i < IDX_COUNT; i++) if ((cps[i])->flags & CP_NCP) (cps[i])->Open(sp); /* Send Up events to all NCPs. */ for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_NCP)) (cps[i])->Up(sp); /* if no NCP is starting, all this was in vain, close down */ sppp_lcp_check_and_close(sp); } static const char * sppp_cp_type_name(u_char type) { static char buf[12]; switch (type) { case CONF_REQ: return "conf-req"; case CONF_ACK: return "conf-ack"; case CONF_NAK: return "conf-nak"; case CONF_REJ: return "conf-rej"; case TERM_REQ: return "term-req"; case TERM_ACK: return "term-ack"; case CODE_REJ: return "code-rej"; case PROTO_REJ: return "proto-rej"; case ECHO_REQ: return "echo-req"; case ECHO_REPLY: return "echo-reply"; case DISC_REQ: return "discard-req"; } snprintf (buf, sizeof(buf), "cp/0x%x", type); return buf; } static const char * sppp_auth_type_name(u_short proto, u_char type) { static char buf[12]; switch (proto) { case PPP_CHAP: switch (type) { case CHAP_CHALLENGE: return "challenge"; case CHAP_RESPONSE: return "response"; case CHAP_SUCCESS: return "success"; case CHAP_FAILURE: return "failure"; } case PPP_PAP: switch (type) { case PAP_REQ: return "req"; case PAP_ACK: return "ack"; case PAP_NAK: return "nak"; } } snprintf (buf, sizeof(buf), "auth/0x%x", type); return buf; } static const char * sppp_lcp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case LCP_OPT_MRU: return "mru"; case LCP_OPT_ASYNC_MAP: return "async-map"; case LCP_OPT_AUTH_PROTO: return "auth-proto"; case LCP_OPT_QUAL_PROTO: return "qual-proto"; case LCP_OPT_MAGIC: return "magic"; case LCP_OPT_PROTO_COMP: return "proto-comp"; case LCP_OPT_ADDR_COMP: return "addr-comp"; } snprintf (buf, sizeof(buf), "lcp/0x%x", opt); return buf; } #ifdef INET static const char * sppp_ipcp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case IPCP_OPT_ADDRESSES: return "addresses"; case IPCP_OPT_COMPRESSION: return "compression"; case IPCP_OPT_ADDRESS: return "address"; } snprintf (buf, sizeof(buf), "ipcp/0x%x", opt); return buf; } #endif #ifdef INET6 static const char * sppp_ipv6cp_opt_name(u_char opt) { static char buf[12]; switch (opt) { case IPV6CP_OPT_IFID: return "ifid"; case IPV6CP_OPT_COMPRESSION: return "compression"; } sprintf (buf, "0x%x", opt); return buf; } #endif static const char * sppp_state_name(int state) { switch (state) { case STATE_INITIAL: return "initial"; case STATE_STARTING: return "starting"; case STATE_CLOSED: return "closed"; case STATE_STOPPED: return "stopped"; case STATE_CLOSING: return "closing"; case STATE_STOPPING: return "stopping"; case STATE_REQ_SENT: return "req-sent"; case STATE_ACK_RCVD: return "ack-rcvd"; case STATE_ACK_SENT: return "ack-sent"; case STATE_OPENED: return "opened"; } return "illegal"; } static const char * sppp_phase_name(enum ppp_phase phase) { switch (phase) { case PHASE_DEAD: return "dead"; case PHASE_ESTABLISH: return "establish"; case PHASE_TERMINATE: return "terminate"; case PHASE_AUTHENTICATE: return "authenticate"; case PHASE_NETWORK: return "network"; } return "illegal"; } static const char * sppp_proto_name(u_short proto) { static char buf[12]; switch (proto) { case PPP_LCP: return "lcp"; case PPP_IPCP: return "ipcp"; case PPP_PAP: return "pap"; case PPP_CHAP: return "chap"; case PPP_IPV6CP: return "ipv6cp"; } snprintf(buf, sizeof(buf), "proto/0x%x", (unsigned)proto); return buf; } static void sppp_print_bytes(const u_char *p, u_short len) { if (len) log(-1, " %*D", len, p, "-"); } static void sppp_print_string(const char *p, u_short len) { u_char c; while (len-- > 0) { c = *p++; /* * Print only ASCII chars directly. RFC 1994 recommends * using only them, but we don't rely on it. */ if (c < ' ' || c > '~') log(-1, "\\x%x", c); else log(-1, "%c", c); } } #ifdef INET static const char * sppp_dotted_quad(u_long addr) { static char s[16]; sprintf(s, "%d.%d.%d.%d", (int)((addr >> 24) & 0xff), (int)((addr >> 16) & 0xff), (int)((addr >> 8) & 0xff), (int)(addr & 0xff)); return s; } #endif static int sppp_strnlen(u_char *p, int max) { int len; for (len = 0; len < max && *p; ++p) ++len; return len; } /* a dummy, used to drop uninteresting events */ static void sppp_null(struct sppp *unused) { /* do just nothing */ } diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index b4b1b77ddc7c..bd9fc811d19f 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,2012 +1,2012 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_UNUSED1 0x0008 #define TUN_UNUSED2 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ int tun_vhdrlen; /* virtio-net header length */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * Interface capabilities of a tap device that supports the virtio-net * header. */ #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ | IFCAP_VLAN_HWCSUM \ | IFCAP_TSO | IFCAP_LRO \ | IFCAP_VLAN_HWTSO) #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. */ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user devfs cloning */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Enable legacy devfs interface creation for all users"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev); static void tundtor(void *data); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); static int tun_clone_destroy(struct if_clone *, struct ifnet *); static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_t *clone_match_fn; ifc_create_t *clone_create_fn; ifc_destroy_t *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. */ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ dev = NULL; i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); /* No preexisting struct cdev *, create one */ if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); if (i == 0) tuncreate(dev); return (i); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } i = tun_create_device(drv, u, cred, dev, name); } if (i == 0) if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. */ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, drv->clone_match_fn, drv->clone_create_fn, drv->clone_destroy_fn); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; /* We'll only ever have these two, so no need for a macro. */ static moduledata_t tun_mod = { "if_tun", NULL, 0 }; static moduledata_t tap_mod = { "if_tap", NULL, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tun, 1); DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tap, 1); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name) { struct make_dev_args args; struct tuntap_softc *tp; int error; tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&tp->tun_cv, "tun_condvar"); tp->tun_flags = drv->ident_flags; tp->tun_drv = drv; make_dev_args_init(&args); if (cr != NULL) args.mda_flags = MAKEDEV_REF; args.mda_devsw = &drv->cdevsw; args.mda_cr = cr; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0600; args.mda_unit = unit; args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { free(tp, M_TUN); return (error); } KASSERT((*dev)->si_drv1 != NULL, ("Failed to set si_drv1 at %s creation", name)); tp->tun_dev = *dev; knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); return (0); } static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. */ static void tuncreate(struct cdev *dev) { struct tuntap_driver *drv; struct tuntap_softc *tp; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); drv = tp->tun_drv; iflags = IFF_MULTICAST; if ((tp->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = tp->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = tp; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if ((tp->tun_flags & TUN_L2) != 0) { ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } TUN_LOCK(tp); tp->tun_flags |= TUN_INITED; TUN_UNLOCK(tp); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. */ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_softc *tp; int error __diagused, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); TUN_LOCK(tp); if ((tp->tun_flags & TUN_INITED) == 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (ENXIO); } if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); /* * This can fail with either ENOENT or EBUSY. This is in the middle of * d_open, so ENOENT should not be possible. EBUSY is possible, but * the only cdevpriv dtor being set will be tundtor and the softc being * passed is constant for a given cdev. We ignore the possible error * because of this as either "unlikely" or "not actually a problem." */ (void)devfs_set_cdevpriv(tp, tundtor); CURVNET_RESTORE(); return (0); } /* * tundtor - tear down the device - mark i/f down & delete * routing info */ static void tundtor(void *data) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = data; p = curproc; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, tp->tun_dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; tun_vnethdr_set(ifp, 0); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); TUN_UNLOCK(tp); } else { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * To be called under TUN_LOCK. Update ifp->if_hwassist according to the * current value of ifp->if_capenable. */ static void tun_caps_changed(struct ifnet *ifp) { uint64_t hwassist = 0; TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc); if (ifp->if_capenable & IFCAP_TXCSUM) hwassist |= CSUM_TCP | CSUM_UDP; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; if (ifp->if_capenable & IFCAP_TSO4) hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) hwassist |= CSUM_IP6_TSO; ifp->if_hwassist = hwassist; } /* * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust * if_capabilities and if_capenable as needed. */ static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen) { struct tuntap_softc *tp = ifp->if_softc; TUN_LOCK_ASSERT(tp); if (tp->tun_vhdrlen == vhdrlen) return; /* * Update if_capabilities to reflect the * functionalities offered by the virtio-net * header. */ if (vhdrlen != 0) ifp->if_capabilities |= TAP_VNET_HDR_CAPS; else ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS; /* * Disable any capabilities that we don't * support anymore. */ ifp->if_capenable &= ifp->if_capabilities; tun_caps_changed(ifp); tp->tun_vhdrlen = vhdrlen; TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n", vhdrlen, ifp->if_capabilities); } /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; case SIOCSIFCAP: TUN_LOCK(tp); ifp->if_capenable = ifr->ifr_reqcap; tun_caps_changed(ifp); TUN_UNLOCK(tp); VLAN_CAPABILITIES(ifp); break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct tuninfo *tunp; int error, iflags, ival; bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); ifp->if_flags = iflags | (ifp->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case TAPSVNETHDR: ival = *(int *)data; if (ival != 0 && ival != sizeof(struct virtio_net_hdr) && ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) { return (EINVAL); } TUN_LOCK(tp); tun_vnethdr_set(ifp, ival); TUN_UNLOCK(tp); return (0); case TAPGVNETHDR: TUN_LOCK(tp); *(int *)data = tp->tun_vhdrlen; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; size_t len; int error = 0; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); if ((tp->tun_flags & TUN_L2) != 0) BPF_MTAP(ifp, m); len = min(tp->tun_vhdrlen, uio->uio_resid); if (len > 0) { struct virtio_net_hdr_mrg_rxbuf vhdr; bzero(&vhdr, sizeof(vhdr)); if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) { m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr); } TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); error = uiomove(&vhdr, len, uio); } while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m, struct virtio_net_hdr_mrg_rxbuf *vhdr) { struct epoch_tracker et; struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) { m_freem(m); return (0); } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); (*ifp->if_input)(ifp, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? */ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct virtio_net_hdr_mrg_rxbuf vhdr; struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align, vhdrlen, error; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; mru = l2tun ? TAPMRU : TUNMRU; vhdrlen = tp->tun_vhdrlen; align = 0; if (l2tun) { align = ETHER_ALIGN; mru += vhdrlen; } else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if (vhdrlen > 0) { error = uiomove(&vhdr, vhdrlen, uio); if (error != 0) return (error); TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/net/route.h b/sys/net/route.h index 67217f237e0b..ec77d39b9649 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -1,449 +1,453 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a * preformed header. They are not cacheable. */ char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ #define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_nhidx; /* route nexhop index */ u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight /* default route weight */ #define RT_DEFAULT_WEIGHT 1 #define RT_MAX_WEIGHT 16777215 /* 3 bytes */ /* * Keep a generation count of routing table, incremented on route addition, * so we can invalidate caches. This is accessed without a lock, as precision * is not required. */ typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ #define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL VNET_DECLARE(uint32_t, _rt_numfibs); /* number of existing route tables */ #define V_rt_numfibs VNET(_rt_numfibs) /* temporary compat arg */ #define rt_numfibs V_rt_numfibs VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) /* Calculate flowid for locally-originated packets */ #define V_fib_hash_outbound VNET(fib_hash_outbound) VNET_DECLARE(u_int, fib_hash_outbound); /* Outbound flowid generation rules */ #ifdef RSS #define fib4_calc_packet_hash xps_proto_software_hash_v4 #define fib6_calc_packet_hash xps_proto_software_hash_v6 #define CALC_FLOWID_OUTBOUND_SENDTO true #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND false #endif #else /* !RSS */ #define fib4_calc_packet_hash fib4_calc_software_hash #define fib6_calc_packet_hash fib6_calc_software_hash #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND_SENDTO V_fib_hash_outbound #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND_SENDTO false #define CALC_FLOWID_OUTBOUND false #endif #endif /* RSS */ #endif /* _KERNEL */ /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ /* 0x40000000 unused, was RTF_RNH_LOCKED */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. */ /* Consumer-visible nexthop info flags */ #define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_NONE 0x00 /* empty flags field */ #define NHR_REF 0x01 /* reference nexhop */ #define NHR_NODEFAULT 0x02 /* uRPF: do not consider default route */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #define NHR_UNLOCKED 0x200 /* Do not lock table */ /* * Routing statistics. */ struct rtstat { uint64_t rts_badredirect; /* bogus redirect calls */ uint64_t rts_dynamic; /* routes created by redirects */ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ uint64_t rts_add_failure; /* # of route addition failures */ uint64_t rts_add_retry; /* # of route addition retries */ uint64_t rts_del_failure; /* # of route deletion failure */ uint64_t rts_del_retry; /* # of route deletion retries */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ u_short _rtm_spare1; int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. * * The format for each message is annotated below using the following * identifiers: * * (1) struct rt_msghdr * (2) struct ifa_msghdr * (3) struct if_msghdr * (4) struct ifma_msghdr * (5) struct if_announcemsghdr * */ #define RTM_ADD 0x1 /* (1) Add Route */ #define RTM_DELETE 0x2 /* (1) Delete Route */ #define RTM_CHANGE 0x3 /* (1) Change Metrics or flags */ #define RTM_GET 0x4 /* (1) Report Metrics */ #define RTM_LOSING 0x5 /* (1) Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* (1) Told to use different route */ #define RTM_MISS 0x7 /* (1) Lookup failed on this address */ #define RTM_LOCK 0x8 /* (1) fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* (1) req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* (2) address being added to iface */ #define RTM_DELADDR 0xd /* (2) address being removed from iface */ #define RTM_IFINFO 0xe /* (3) iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* (4) mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* (4) mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* (5) iface arrival/departure */ #define RTM_IEEE80211 0x12 /* (5) IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ struct rtentry; struct nhop_object; typedef int rib_filter_f_t(const struct rtentry *, const struct nhop_object *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rib_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter paramenters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RO_NHFREE(_ro) do { \ if ((_ro)->ro_nh) { \ NH_FREE((_ro)->ro_nh); \ (_ro)->ro_nh = NULL; \ } \ } while (0) #define RO_INVALIDATE_CACHE(ro) do { \ if ((ro)->ro_lle != NULL) { \ LLE_FREE((ro)->ro_lle); \ (ro)->ro_lle = NULL; \ } \ if ((ro)->ro_nh != NULL) { \ NH_FREE((ro)->ro_nh); \ (ro)->ro_nh = NULL; \ } \ } while (0) +#define RO_GET_FAMILY(ro, dst) ((ro) != NULL && \ + (ro)->ro_flags & RT_HAS_GW \ + ? (ro)->ro_dst.sa_family : (dst)->sa_family) + /* * Validate a cached route based on a supplied cookie. If there is an * out-of-date cache, simply free it. Update the generation number * for the new allocation */ #define NH_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ RO_INVALIDATE_CACHE(ro); \ *(cookiep) = cookie; \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct rtentry *, struct nhop_object *, int); int rt_routemsg_info(int, struct rt_addrinfo *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int, int, u_int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(uint32_t table, sa_family_t family); struct sockaddr *rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask, struct sockaddr_storage *dmask); void rt_updatemtu(struct ifnet *); void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. */ int rtioctl_fib(u_long, caddr_t, u_int); int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); /* New API */ void rib_flush_routes_family(int family); struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); #endif #endif diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c index a686d1623053..6db088102cd3 100644 --- a/sys/net/route/route_ctl.c +++ b/sys/net/route/route_ctl.c @@ -1,1560 +1,1588 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains control plane routing tables functions. * * All functions assumes they are called in net epoch. */ struct rib_subscription { CK_STAILQ_ENTRY(rib_subscription) next; rib_subscription_cb_t *func; void *arg; struct rib_head *rnh; enum rib_subscription_type type; struct epoch_context epoch_ctx; }; static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc); static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); static void destroy_subscription_epoch(epoch_context_t ctx); #ifdef ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh); #endif /* Per-vnet multipath routing configuration */ SYSCTL_DECL(_net_route); #define V_rib_route_multipath VNET(rib_route_multipath) #ifdef ROUTE_MPATH #define _MP_FLAGS CTLFLAG_RW #else #define _MP_FLAGS CTLFLAG_RD #endif VNET_DEFINE(u_int, rib_route_multipath) = 1; SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); #undef _MP_FLAGS +#if defined(INET) && defined(INET6) +FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops"); +#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop) +VNET_DEFINE(u_int, rib_route_ipv6_nexthop) = 1; +SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address"); +#endif + /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); #define V_rtzone VNET(rtzone) void vnet_rtzone_init() { V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } #ifdef VIMAGE void vnet_rtzone_destroy() { uma_zdestroy(V_rtzone); } #endif static void destroy_rtentry(struct rtentry *rt) { #ifdef VIMAGE struct nhop_object *nh = rt->rt_nhop; /* * At this moment rnh, nh_control may be already freed. * nhop interface may have been migrated to a different vnet. * Use vnet stored in the nexthop to delete the entry. */ #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); nh = wn[0].nh; } #endif CURVNET_SET(nhop_get_vnet(nh)); #endif /* Unreference nexthop */ nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); CURVNET_RESTORE(); } /* * Epoch callback indicating rtentry is safe to destroy */ static void destroy_rtentry_epoch(epoch_context_t ctx) { struct rtentry *rt; rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); destroy_rtentry(rt); } /* * Schedule rtentry deletion */ static void rtfree(struct rtentry *rt) { KASSERT(rt != NULL, ("%s: NULL rt", __func__)); epoch_call(net_epoch_preempt, destroy_rtentry_epoch, &rt->rt_epoch_ctx); } static struct rib_head * get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) { struct rib_head *rnh; struct sockaddr *dst; KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum")); dst = info->rti_info[RTAX_DST]; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); return (rnh); } +#if defined(INET) && defined(INET6) +static bool +rib_can_ipv6_nexthop_address(struct rib_head *rh) +{ + int result; + + CURVNET_SET(rh->rib_vnet); + result = !!V_rib_route_ipv6_nexthop; + CURVNET_RESTORE(); + + return (result); +} +#endif + #ifdef ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh) { int result; CURVNET_SET(rh->rib_vnet); result = !!V_rib_route_multipath; CURVNET_RESTORE(); return (result); } /* * Check is nhop is multipath-eligible. * Avoid nhops without gateways and redirects. * * Returns 1 for multipath-eligible nexthop, * 0 otherwise. */ bool nhop_can_multipath(const struct nhop_object *nh) { if ((nh->nh_flags & NHF_MULTIPATH) != 0) return (1); if ((nh->nh_flags & NHF_GATEWAY) == 0) return (0); if ((nh->nh_flags & NHF_REDIRECT) != 0) return (0); return (1); } #endif static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { uint32_t weight; if (info->rti_mflags & RTV_WEIGHT) weight = info->rti_rmx->rmx_weight; else weight = default_weight; /* Keep upper 1 byte for adm distance purposes */ if (weight > RT_MAX_WEIGHT) weight = RT_MAX_WEIGHT; return (weight); } bool rt_is_host(const struct rtentry *rt) { return (rt->rte_flags & RTF_HOST); } sa_family_t rt_get_family(const struct rtentry *rt) { const struct sockaddr *dst; dst = (const struct sockaddr *)rt_key_const(rt); return (dst->sa_family); } /* * Returns pointer to nexthop or nexthop group * associated with @rt */ struct nhop_object * rt_get_raw_nhop(const struct rtentry *rt) { return (rt->rt_nhop); } #ifdef INET /* * Stores IPv4 address and prefix length of @rt inside * @paddr and @plen. * @pscopeid is currently always set to 0. */ void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) *plen = 32; else *plen = bitcount32(dst->sin_addr.s_addr); *pscopeid = 0; } /* * Stores IPv4 address and prefix mask of @rt inside * @paddr and @pmask. Sets mask to INADDR_ANY for host routes. * @pscopeid is currently always set to 0. */ void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr, struct in_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) pmask->s_addr = INADDR_BROADCAST; else *pmask = dst->sin_addr; *pscopeid = 0; } #endif #ifdef INET6 static int inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } /* * Stores IPv6 address and prefix length of @rt inside * @paddr and @plen. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet6", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) *plen = 128; else *plen = inet6_get_plen(&dst->sin6_addr); } /* * Stores IPv6 address and prefix mask of @rt inside * @paddr and @pmask. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr, struct in6_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) memset(pmask, 0xFF, sizeof(struct in6_addr)); else *pmask = dst->sin6_addr; } #endif static void rt_set_expire_info(struct rtentry *rt, const struct rt_addrinfo *info) { /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } /* * Check if specified @gw matches gw data in the nexthop @nh. * * Returns true if matches, false otherwise. */ bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { if (nh->gw_sa.sa_family != gw->sa_family) return (false); switch (gw->sa_family) { case AF_INET: return (nh->gw4_sa.sin_addr.s_addr == ((const struct sockaddr_in *)gw)->sin_addr.s_addr); case AF_INET6: { const struct sockaddr_in6 *gw6; gw6 = (const struct sockaddr_in6 *)gw; /* * Currently (2020-09) IPv6 gws in kernel have their * scope embedded. Once this becomes false, this code * has to be revisited. */ if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, &gw6->sin6_addr)) return (true); return (false); } case AF_LINK: { const struct sockaddr_dl *sdl; sdl = (const struct sockaddr_dl *)gw; return (nh->gwl_sa.sdl_index == sdl->sdl_index); } default: return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); } /* NOTREACHED */ return (false); } /* * Checks if data in @info matches nexhop @nh. * * Returns 0 on success, * ESRCH if not matched, * ENOENT if filter function returned false */ int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh) { const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; if (info->rti_filter != NULL) { if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) return (ENOENT); else return (0); } if ((gw != NULL) && !match_nhop_gw(nh, gw)) return (ESRCH); return (0); } /* * Checks if nexhop @nh can be rewritten by data in @info because * of higher "priority". Currently the only case for such scenario * is kernel installing interface routes, marked by RTF_PINNED flag. * * Returns: * 1 if @info data has higher priority * 0 if priority is the same * -1 if priority is lower */ int can_override_nhop(const struct rt_addrinfo *info, const struct nhop_object *nh) { if (info->rti_flags & RTF_PINNED) { return (NH_IS_PINNED(nh)) ? 0 : 1; } else { return (NH_IS_PINNED(nh)) ? -1 : 0; } } /* * Runs exact prefix match based on @dst and @netmask. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. */ static struct rtentry * lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, const struct sockaddr *netmask, struct route_nhop_data *rnd) { struct rtentry *rt; RIB_LOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(__DECONST(void *, dst), __DECONST(void *, netmask), &rnh->head); if (rt != NULL) { rnd->rnd_nhop = rt->rt_nhop; rnd->rnd_weight = rt->rt_weight; } else { rnd->rnd_nhop = NULL; rnd->rnd_weight = 0; } return (rt); } /* * Runs exact prefix match based on dst/netmask from @info. * Assumes RIB lock is held. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. */ struct rtentry * lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, struct route_nhop_data *rnd) { struct rtentry *rt; rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], rnd); return (rt); } /* * Adds route defined by @info into the kernel table specified by @fibnum and * sa_family in @info->rti_info[RTAX_DST]. * * Returns 0 on success and fills in operation metadata into @rc. */ int rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rib_head *rnh; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); /* * Check consistency between RTF_HOST flag and netmask * existence. */ if (info->rti_flags & RTF_HOST) info->rti_info[RTAX_NETMASK] = NULL; else if (info->rti_info[RTAX_NETMASK] == NULL) return (EINVAL); bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_ADD; error = add_route(rnh, info, rc); if (error == 0) rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); return (error); } /* * Checks if @dst and @gateway is valid combination. * * Returns true if is valid, false otherwise. */ static bool check_gateway(struct rib_head *rnh, struct sockaddr *dst, struct sockaddr *gateway) { if (dst->sa_family == gateway->sa_family) return (true); else if (gateway->sa_family == AF_UNSPEC) return (true); else if (gateway->sa_family == AF_LINK) return (true); - return (false); +#if defined(INET) && defined(INET6) + else if (dst->sa_family == AF_INET && gateway->sa_family == AF_INET6 && + rib_can_ipv6_nexthop_address(rnh)) + return (true); +#endif + else + return (false); } /* * Creates rtentry and nexthop based on @info data. * Return 0 and fills in rtentry into @prt on success, * return errno otherwise. */ static int create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **prt) { struct sockaddr *dst, *ndst, *gateway, *netmask; struct rtentry *rt; struct nhop_object *nh; struct ifaddr *ifa; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; netmask = info->rti_info[RTAX_NETMASK]; flags = info->rti_flags; if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && !check_gateway(rnh, dst, gateway)) return (EINVAL); if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error) return (error); } error = nhop_create_from_info(rnh, info, &nh); if (error != 0) return (error); rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); if (rt == NULL) { nhop_free(nh); return (ENOBUFS); } rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; rt->rt_nhop = nh; /* Fill in dst */ memcpy(&rt->rt_dst, dst, dst->sa_len); rt_key(rt) = &rt->rt_dst; /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ ifa = info->rti_ifa; rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); rt_set_expire_info(rt, info); *prt = rt; return (0); } static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct nhop_object *nh_orig; struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; struct rtentry *rt, *rt_orig; int error; error = create_rtentry(rnh, info, &rt); if (error != 0) return (error); rnd_add.rnd_nhop = rt->rt_nhop; rnd_add.rnd_weight = rt->rt_weight; nh = rt->rt_nhop; RIB_WLOCK(rnh); error = add_route_nhop(rnh, rt, info, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. Lookup prefix in the rib to determine the cause */ rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); nhop_free(nh); uma_zfree(V_rtzone, rt); return (ENOMEM); } /* We have existing route in the RIB. */ nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (can_override_nhop(info, nh_orig) > 0) { /* Update nexthop to the new route */ change_route_nhop(rnh, rt_orig, info, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); return (0); } RIB_WUNLOCK(rnh); #ifdef ROUTE_MPATH if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && nhop_can_multipath(rnd_orig.rnd_nhop)) error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); else #endif /* Unable to add - another route with the same preference exists */ error = EEXIST; /* * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. * ROUTE_MPATH enabled: original nhop reference is unused in any case, * free rt only if not _adding_ new route to rib (e.g. the case * when initial lookup returned existing route, but then it got * deleted prior to multipath group insertion, leading to a simple * non-multipath add as a result). */ nhop_free(nh); if ((error != 0) || rc->rc_cmd != RTM_ADD) uma_zfree(V_rtzone, rt); return (error); } /* * Removes route defined by @info from the kernel table specified by @fibnum and * sa_family in @info->rti_info[RTAX_DST]. * * Returns 0 on success and fills in operation metadata into @rc. */ int rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rib_head *rnh; struct sockaddr *dst_orig, *netmask; struct sockaddr_storage mdst; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_DELETE; dst_orig = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; if (netmask != NULL) { /* Ensure @dst is always properly masked */ if (dst_orig->sa_len > sizeof(mdst)) return (EINVAL); rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask); info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst; } error = del_route(rnh, info, rc); info->rti_info[RTAX_DST] = dst_orig; return (error); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns 0 on success with operation result stored in @rc. * On error, returns: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete higher priority route. * ENOENT - if supplied filter function returned 0 (not matched). */ static int rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rtentry *rt; struct nhop_object *nh; struct radix_node *rn; struct route_nhop_data rnd; int error; rt = lookup_prefix(rnh, info, &rnd); if (rt == NULL) return (ESRCH); nh = rt->rt_nhop; #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { error = del_route_mpath(rnh, info, rt, (struct nhgrp_object *)nh, rc); return (error); } #endif error = check_info_match_nhop(info, rt, nh); if (error != 0) return (error); if (can_override_nhop(info, nh) < 0) return (EADDRINUSE); /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rn == NULL) return (ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); rt->rte_flags &= ~RTF_UP; /* Finalize notification */ rib_bump_gen(rnh); rnh->rnh_prefixes--; rc->rc_cmd = RTM_DELETE; rc->rc_rt = rt; rc->rc_nh_old = rt->rt_nhop; rc->rc_nh_weight = rt->rt_weight; rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); return (0); } static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc) { int error; RIB_WLOCK(rnh); error = rt_unlinkrte(rnh, info, rc); RIB_WUNLOCK(rnh); if (error != 0) return (error); rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); /* * If the caller wants it, then it can have it, * the entry will be deleted after the end of the current epoch. */ if (rc->rc_cmd == RTM_DELETE) rtfree(rc->rc_rt); #ifdef ROUTE_MPATH else { /* * Deleting 1 path may result in RTM_CHANGE to * a different mpath group/nhop. * Free old mpath group. */ nhop_free_any(rc->rc_nh_old); } #endif return (0); } int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { RIB_RLOCK_TRACKER; struct route_nhop_data rnd_orig; struct rib_head *rnh; struct rtentry *rt; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_CHANGE; /* Check if updated gateway exists */ if ((info->rti_flags & RTF_GATEWAY) && (info->rti_info[RTAX_GATEWAY] == NULL)) { /* * route(8) adds RTF_GATEWAY flag if -interface is not set. * Remove RTF_GATEWAY to enforce consistency and maintain * compatibility.. */ info->rti_flags &= ~RTF_GATEWAY; } /* * route change is done in multiple steps, with dropping and * reacquiring lock. In the situations with multiple processes * changes the same route in can lead to the case when route * is changed between the steps. Address it by retrying the operation * multiple times before failing. */ RIB_RLOCK(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rnd_orig.rnd_nhop = rt->rt_nhop; rnd_orig.rnd_weight = rt->rt_weight; RIB_RUNLOCK(rnh); for (int i = 0; i < RIB_MAX_RETRIES; i++) { error = change_route(rnh, info, &rnd_orig, rc); if (error != EAGAIN) break; } return (error); } static int change_nhop(struct rib_head *rnh, struct rt_addrinfo *info, struct nhop_object *nh_orig, struct nhop_object **nh_new) { int error; /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((nh_orig->nh_flags & NHF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error != 0) { info->rti_ifa = NULL; return (error); } } error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new); info->rti_ifa = NULL; return (error); } #ifdef ROUTE_MPATH static int change_mpath_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) { int error = 0; struct nhop_object *nh, *nh_orig, *nh_new; struct route_nhop_data rnd_new; nh = NULL; nh_orig = rnd_orig->rnd_nhop; struct weightened_nhop *wn = NULL, *wn_new; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh_orig, &num_nhops); nh_orig = NULL; for (int i = 0; i < num_nhops; i++) { if (check_info_match_nhop(info, NULL, wn[i].nh)) { nh_orig = wn[i].nh; break; } } if (nh_orig == NULL) return (ESRCH); error = change_nhop(rnh, info, nh_orig, &nh_new); if (error != 0) return (error); wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT | M_ZERO); if (wn_new == NULL) { nhop_free(nh_new); return (EAGAIN); } memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop)); for (int i = 0; i < num_nhops; i++) { if (wn[i].nh == nh_orig) { wn[i].nh = nh_new; wn[i].weight = get_info_weight(info, rnd_orig->rnd_weight); break; } } error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new); nhop_free(nh_new); free(wn_new, M_TEMP); if (error != 0) return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); } #endif static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc) { int error = 0; struct nhop_object *nh, *nh_orig; struct route_nhop_data rnd_new; nh = NULL; nh_orig = rnd_orig->rnd_nhop; if (nh_orig == NULL) return (ESRCH); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh_orig)) return (change_mpath_route(rnh, info, rnd_orig, rc)); #endif rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight); error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop); if (error != 0) return (error); error = change_route_conditional(rnh, NULL, info, rnd_orig, &rnd_new, rc); return (error); } /* * Insert @rt with nhop data from @rnd_new to @rnh. * Returns 0 on success and stores operation results in @rc. */ static int add_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc) { struct sockaddr *ndst, *netmask; struct radix_node *rn; int error = 0; RIB_WLOCK_ASSERT(rnh); ndst = (struct sockaddr *)rt_key(rt); netmask = info->rti_info[RTAX_NETMASK]; rt->rt_nhop = rnd->rnd_nhop; rt->rt_weight = rnd->rnd_weight; rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); if (rn != NULL) { if (rt->rt_expire > 0) tmproutes_update(rnh, rt); /* Finalize notification */ rib_bump_gen(rnh); rnh->rnh_prefixes++; rc->rc_cmd = RTM_ADD; rc->rc_rt = rt; rc->rc_nh_old = NULL; rc->rc_nh_new = rnd->rnd_nhop; rc->rc_nh_weight = rnd->rnd_weight; rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); } else { /* Existing route or memory allocation failure */ error = EEXIST; } return (error); } /* * Switch @rt nhop/weigh to the ones specified in @rnd. * Conditionally set rt_expire if set in @info. * Returns 0 on success. */ int change_route_nhop(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd, struct rib_cmd_info *rc) { struct nhop_object *nh_orig; RIB_WLOCK_ASSERT(rnh); nh_orig = rt->rt_nhop; if (rnd->rnd_nhop != NULL) { /* Changing expiration & nexthop & weight to a new one */ rt_set_expire_info(rt, info); rt->rt_nhop = rnd->rnd_nhop; rt->rt_weight = rnd->rnd_weight; if (rt->rt_expire > 0) tmproutes_update(rnh, rt); } else { /* Route deletion requested. */ struct sockaddr *ndst, *netmask; struct radix_node *rn; ndst = (struct sockaddr *)rt_key(rt); netmask = info->rti_info[RTAX_NETMASK]; rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head); if (rn == NULL) return (ESRCH); rt = RNTORT(rn); rt->rte_flags &= ~RTF_UP; } /* Finalize notification */ rib_bump_gen(rnh); if (rnd->rnd_nhop == NULL) rnh->rnh_prefixes--; rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE; rc->rc_rt = rt; rc->rc_nh_old = nh_orig; rc->rc_nh_new = rnd->rnd_nhop; rc->rc_nh_weight = rnd->rnd_weight; rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc); return (0); } /* * Conditionally update route nhop/weight IFF data in @nhd_orig is * consistent with the current route data. * Nexthop in @nhd_new is consumed. */ int change_route_conditional(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new, struct rib_cmd_info *rc) { struct rtentry *rt_new; int error = 0; RIB_WLOCK(rnh); rt_new = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt_new == NULL) { if (rnd_orig->rnd_nhop == NULL) error = add_route_nhop(rnh, rt, info, rnd_new, rc); else { /* * Prefix does not exist, which was not our assumption. * Update @rnd_orig with the new data and return */ rnd_orig->rnd_nhop = NULL; rnd_orig->rnd_weight = 0; error = EAGAIN; } } else { /* Prefix exists, try to update */ if (rnd_orig->rnd_nhop == rt_new->rt_nhop) { /* * Nhop/mpath group hasn't changed. Flip * to the new precalculated one and return */ error = change_route_nhop(rnh, rt_new, info, rnd_new, rc); } else { /* Update and retry */ rnd_orig->rnd_nhop = rt_new->rt_nhop; rnd_orig->rnd_weight = rt_new->rt_weight; error = EAGAIN; } } RIB_WUNLOCK(rnh); if (error == 0) { rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); if (rnd_orig->rnd_nhop != NULL) nhop_free_any(rnd_orig->rnd_nhop); } else { if (rnd_new->rnd_nhop != NULL) nhop_free_any(rnd_new->rnd_nhop); } return (error); } /* * Performs modification of routing table specificed by @action. * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST]. * Needs to be run in network epoch. * * Returns 0 on success and fills in @rc with action result. */ int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc) { int error; switch (action) { case RTM_ADD: error = rib_add_route(fibnum, info, rc); break; case RTM_DELETE: error = rib_del_route(fibnum, info, rc); break; case RTM_CHANGE: error = rib_change_route(fibnum, info, rc); break; default: error = ENOTSUP; } return (error); } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; struct rib_cmd_info rc; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); if (rt_unlinkrte(di->rnh, info, &di->rc) != 0) return (0); /* * Add deleted rtentries to the list to GC them * after dropping the lock. * * XXX: Delayed notifications not implemented * for nexthop updates. */ if (di->rc.rc_cmd == RTM_DELETE) { /* Add to the list and return */ rt->rt_chain = di->head; di->head = rt; #ifdef ROUTE_MPATH } else { /* * RTM_CHANGE to a diferent nexthop or nexthop group. * Free old multipath group. */ nhop_free_any(di->rc.rc_nh_old); #endif } return (0); } /* * Iterates over a routing table specified by @fibnum and @family and * deletes elements marked by @filter_f. * @fibnum: rtable id * @family: AF_ address family * @filter_f: function returning non-zero value for items to delete * @arg: data to pass to the @filter_f function * @report: true if rtsock notification is needed. */ void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *arg, bool report) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; struct nhop_object *nh; struct epoch_tracker et; rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; di.rnh = rnh; di.rc.rc_cmd = RTM_DELETE; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); /* We might have something to reclaim. */ bzero(&di.rc, sizeof(di.rc)); di.rc.rc_cmd = RTM_DELETE; while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; nh = rt->rt_nhop; di.rc.rc_rt = rt; di.rc.rc_nh_old = nh; rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc); /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); if (report) { #ifdef ROUTE_MPATH struct nhgrp_object *nhg; struct weightened_nhop *wn; uint32_t num_nhops; if (NH_IS_NHGRP(nh)) { nhg = (struct nhgrp_object *)nh; wn = nhgrp_get_nhops(nhg, &num_nhops); for (int i = 0; i < num_nhops; i++) rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum); } else #endif rt_routemsg(RTM_DELETE, rt, nh, fibnum); } rtfree(rt); } NET_EPOCH_EXIT(et); } static int rt_delete_unconditional(struct radix_node *rn, void *arg) { struct rtentry *rt = RNTORT(rn); struct rib_head *rnh = (struct rib_head *)arg; rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head); if (RNTORT(rn) == rt) rtfree(rt); return (0); } /* * Removes all routes from the routing table without executing notifications. * rtentres will be removed after the end of a current epoch. */ static void rib_flush_routes(struct rib_head *rnh) { RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh); RIB_WUNLOCK(rnh); } void rib_flush_routes_family(int family) { struct rib_head *rnh; for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) { if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL) rib_flush_routes(rnh); } } static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc) { struct rib_subscription *rs; CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) { if (rs->type == type) rs->func(rnh, rc, rs->arg); } } static struct rib_subscription * allocate_subscription(rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT); rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags); if (rs == NULL) return (NULL); rs->func = f; rs->arg = arg; rs->type = type; return (rs); } /* * Subscribe for the changes in the routing table specified by @fibnum and * @family. * * Returns pointer to the subscription structure on success. */ struct rib_subscription * rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_head *rnh; struct epoch_tracker et; NET_EPOCH_ENTER(et); KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__)); rnh = rt_tables_get_rnh(fibnum, family); NET_EPOCH_EXIT(et); return (rib_subscribe_internal(rnh, f, arg, type, waitok)); } struct rib_subscription * rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type, bool waitok) { struct rib_subscription *rs; struct epoch_tracker et; if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL) return (NULL); rs->rnh = rnh; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); return (rs); } struct rib_subscription * rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type) { struct rib_subscription *rs; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rnh); if ((rs = allocate_subscription(f, arg, type, false)) == NULL) return (NULL); rs->rnh = rnh; CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next); return (rs); } /* * Remove rtable subscription @rs from the routing table. * Needs to be run in network epoch. */ void rib_unsubscribe(struct rib_subscription *rs) { struct rib_head *rnh = rs->rnh; NET_EPOCH_ASSERT(); RIB_WLOCK(rnh); CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); RIB_WUNLOCK(rnh); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } void rib_unsubscribe_locked(struct rib_subscription *rs) { struct rib_head *rnh = rs->rnh; NET_EPOCH_ASSERT(); RIB_WLOCK_ASSERT(rnh); CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } /* * Epoch callback indicating subscription is safe to destroy */ static void destroy_subscription_epoch(epoch_context_t ctx) { struct rib_subscription *rs; rs = __containerof(ctx, struct rib_subscription, epoch_ctx); free(rs, M_RTABLE); } void rib_init_subscriptions(struct rib_head *rnh) { CK_STAILQ_INIT(&rnh->rnh_subscribers); } void rib_destroy_subscriptions(struct rib_head *rnh) { struct rib_subscription *rs; struct epoch_tracker et; NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) { CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next); epoch_call(net_epoch_preempt, destroy_subscription_epoch, &rs->epoch_ctx); } RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); } diff --git a/sys/netgraph/netflow/netflow.c b/sys/netgraph/netflow/netflow.c index 7d4108ee59a7..5c0358d9b67a 100644 --- a/sys/netgraph/netflow/netflow.c +++ b/sys/netgraph/netflow/netflow.c @@ -1,1178 +1,1182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010-2011 Alexander V. Chernikov * Copyright (c) 2004-2005 Gleb Smirnoff * Copyright (c) 2001-2003 Roman V. Palagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $SourceForge: netflow.c,v 1.41 2004/09/05 11:41:10 glebius Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NBUCKETS (65536) /* must be power of 2 */ /* This hash is for TCP or UDP packets. */ #define FULL_HASH(addr1, addr2, port1, port2) \ (((addr1 ^ (addr1 >> 16) ^ \ htons(addr2 ^ (addr2 >> 16))) ^ \ port1 ^ htons(port2)) & \ (NBUCKETS - 1)) /* This hash is for all other IP packets. */ #define ADDR_HASH(addr1, addr2) \ ((addr1 ^ (addr1 >> 16) ^ \ htons(addr2 ^ (addr2 >> 16))) & \ (NBUCKETS - 1)) /* Macros to shorten logical constructions */ /* XXX: priv must exist in namespace */ #define INACTIVE(fle) (time_uptime - fle->f.last > priv->nfinfo_inact_t) #define AGED(fle) (time_uptime - fle->f.first > priv->nfinfo_act_t) #define ISFREE(fle) (fle->f.packets == 0) /* * 4 is a magical number: statistically number of 4-packet flows is * bigger than 5,6,7...-packet flows by an order of magnitude. Most UDP/ICMP * scans are 1 packet (~ 90% of flow cache). TCP scans are 2-packet in case * of reachable host and 4-packet otherwise. */ #define SMALL(fle) (fle->f.packets <= 4) MALLOC_DEFINE(M_NETFLOW_HASH, "netflow_hash", "NetFlow hash"); static int export_add(item_p, struct flow_entry *); static int export_send(priv_p, fib_export_p, item_p, int); #ifdef INET static int hash_insert(priv_p, struct flow_hash_entry *, struct flow_rec *, int, uint8_t, uint8_t); #endif #ifdef INET6 static int hash6_insert(priv_p, struct flow_hash_entry *, struct flow6_rec *, int, uint8_t, uint8_t); #endif static void expire_flow(priv_p, fib_export_p, struct flow_entry *, int); #ifdef INET /* * Generate hash for a given flow record. * * FIB is not used here, because: * most VRFS will carry public IPv4 addresses which are unique even * without FIB private addresses can overlap, but this is worked out * via flow_rec bcmp() containing fib id. In IPv6 world addresses are * all globally unique (it's not fully true, there is FC00::/7 for example, * but chances of address overlap are MUCH smaller) */ static inline uint32_t ip_hash(struct flow_rec *r) { switch (r->r_ip_p) { case IPPROTO_TCP: case IPPROTO_UDP: return FULL_HASH(r->r_src.s_addr, r->r_dst.s_addr, r->r_sport, r->r_dport); default: return ADDR_HASH(r->r_src.s_addr, r->r_dst.s_addr); } } #endif #ifdef INET6 /* Generate hash for a given flow6 record. Use lower 4 octets from v6 addresses */ static inline uint32_t ip6_hash(struct flow6_rec *r) { switch (r->r_ip_p) { case IPPROTO_TCP: case IPPROTO_UDP: return FULL_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3], r->dst.r_dst6.__u6_addr.__u6_addr32[3], r->r_sport, r->r_dport); default: return ADDR_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3], r->dst.r_dst6.__u6_addr.__u6_addr32[3]); } } #endif /* * Detach export datagram from priv, if there is any. * If there is no, allocate a new one. */ static item_p get_export_dgram(priv_p priv, fib_export_p fe) { item_p item = NULL; mtx_lock(&fe->export_mtx); if (fe->exp.item != NULL) { item = fe->exp.item; fe->exp.item = NULL; } mtx_unlock(&fe->export_mtx); if (item == NULL) { struct netflow_v5_export_dgram *dgram; struct mbuf *m; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return (NULL); item = ng_package_data(m, NG_NOFLAGS); if (item == NULL) return (NULL); dgram = mtod(m, struct netflow_v5_export_dgram *); dgram->header.count = 0; dgram->header.version = htons(NETFLOW_V5); dgram->header.pad = 0; } return (item); } /* * Re-attach incomplete datagram back to priv. * If there is already another one, then send incomplete. */ static void return_export_dgram(priv_p priv, fib_export_p fe, item_p item, int flags) { /* * It may happen on SMP, that some thread has already * put its item there, in this case we bail out and * send what we have to collector. */ mtx_lock(&fe->export_mtx); if (fe->exp.item == NULL) { fe->exp.item = item; mtx_unlock(&fe->export_mtx); } else { mtx_unlock(&fe->export_mtx); export_send(priv, fe, item, flags); } } /* * The flow is over. Call export_add() and free it. If datagram is * full, then call export_send(). */ static void expire_flow(priv_p priv, fib_export_p fe, struct flow_entry *fle, int flags) { struct netflow_export_item exp; uint16_t version = fle->f.version; if ((priv->export != NULL) && (version == IPVERSION)) { exp.item = get_export_dgram(priv, fe); if (exp.item == NULL) { priv->nfinfo_export_failed++; if (priv->export9 != NULL) priv->nfinfo_export9_failed++; /* fle definitely contains IPv4 flow. */ uma_zfree_arg(priv->zone, fle, priv); return; } if (export_add(exp.item, fle) > 0) export_send(priv, fe, exp.item, flags); else return_export_dgram(priv, fe, exp.item, NG_QUEUE); } if (priv->export9 != NULL) { exp.item9 = get_export9_dgram(priv, fe, &exp.item9_opt); if (exp.item9 == NULL) { priv->nfinfo_export9_failed++; if (version == IPVERSION) uma_zfree_arg(priv->zone, fle, priv); #ifdef INET6 else if (version == IP6VERSION) uma_zfree_arg(priv->zone6, fle, priv); #endif else panic("ng_netflow: Unknown IP proto: %d", version); return; } if (export9_add(exp.item9, exp.item9_opt, fle) > 0) export9_send(priv, fe, exp.item9, exp.item9_opt, flags); else return_export9_dgram(priv, fe, exp.item9, exp.item9_opt, NG_QUEUE); } if (version == IPVERSION) uma_zfree_arg(priv->zone, fle, priv); #ifdef INET6 else if (version == IP6VERSION) uma_zfree_arg(priv->zone6, fle, priv); #endif } /* Get a snapshot of node statistics */ void ng_netflow_copyinfo(priv_p priv, struct ng_netflow_info *i) { i->nfinfo_bytes = counter_u64_fetch(priv->nfinfo_bytes); i->nfinfo_packets = counter_u64_fetch(priv->nfinfo_packets); i->nfinfo_bytes6 = counter_u64_fetch(priv->nfinfo_bytes6); i->nfinfo_packets6 = counter_u64_fetch(priv->nfinfo_packets6); i->nfinfo_sbytes = counter_u64_fetch(priv->nfinfo_sbytes); i->nfinfo_spackets = counter_u64_fetch(priv->nfinfo_spackets); i->nfinfo_sbytes6 = counter_u64_fetch(priv->nfinfo_sbytes6); i->nfinfo_spackets6 = counter_u64_fetch(priv->nfinfo_spackets6); i->nfinfo_act_exp = counter_u64_fetch(priv->nfinfo_act_exp); i->nfinfo_inact_exp = counter_u64_fetch(priv->nfinfo_inact_exp); i->nfinfo_used = uma_zone_get_cur(priv->zone); #ifdef INET6 i->nfinfo_used6 = uma_zone_get_cur(priv->zone6); #endif i->nfinfo_alloc_failed = priv->nfinfo_alloc_failed; i->nfinfo_export_failed = priv->nfinfo_export_failed; i->nfinfo_export9_failed = priv->nfinfo_export9_failed; i->nfinfo_realloc_mbuf = priv->nfinfo_realloc_mbuf; i->nfinfo_alloc_fibs = priv->nfinfo_alloc_fibs; i->nfinfo_inact_t = priv->nfinfo_inact_t; i->nfinfo_act_t = priv->nfinfo_act_t; } /* * Insert a record into defined slot. * * First we get for us a free flow entry, then fill in all * possible fields in it. * * TODO: consider dropping hash mutex while filling in datagram, * as this was done in previous version. Need to test & profile * to be sure. */ #ifdef INET static int hash_insert(priv_p priv, struct flow_hash_entry *hsh, struct flow_rec *r, int plen, uint8_t flags, uint8_t tcp_flags) { struct flow_entry *fle; mtx_assert(&hsh->mtx, MA_OWNED); fle = uma_zalloc_arg(priv->zone, priv, M_NOWAIT); if (fle == NULL) { priv->nfinfo_alloc_failed++; return (ENOMEM); } /* * Now fle is totally ours. It is detached from all lists, * we can safely edit it. */ fle->f.version = IPVERSION; bcopy(r, &fle->f.r, sizeof(struct flow_rec)); fle->f.bytes = plen; fle->f.packets = 1; fle->f.tcp_flags = tcp_flags; fle->f.first = fle->f.last = time_uptime; /* * First we do route table lookup on destination address. So we can * fill in out_ifx, dst_mask, nexthop, and dst_as in future releases. */ if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) { struct rtentry *rt; struct route_nhop_data rnd; rt = fib4_lookup_rt(r->fib, fle->f.r.r_dst, 0, NHR_NONE, &rnd); if (rt != NULL) { struct in_addr addr; uint32_t scopeid; struct nhop_object *nh = nhop_select_func(rnd.rnd_nhop, 0); int plen; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); fle->f.fle_o_ifx = nh->nh_ifp->if_index; if (nh->gw_sa.sa_family == AF_INET) fle->f.next_hop = nh->gw4_sa.sin_addr; + /* + * XXX we're leaving an empty gateway here for + * IPv6 nexthops. + */ fle->f.dst_mask = plen; } } /* Do route lookup on source address, to fill in src_mask. */ if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) { struct rtentry *rt; struct route_nhop_data rnd; rt = fib4_lookup_rt(r->fib, fle->f.r.r_src, 0, NHR_NONE, &rnd); if (rt != NULL) { struct in_addr addr; uint32_t scopeid; int plen; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); fle->f.src_mask = plen; } } /* Push new flow at the and of hash. */ TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash); return (0); } #endif #ifdef INET6 static int hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r, int plen, uint8_t flags, uint8_t tcp_flags) { struct flow6_entry *fle6; mtx_assert(&hsh6->mtx, MA_OWNED); fle6 = uma_zalloc_arg(priv->zone6, priv, M_NOWAIT); if (fle6 == NULL) { priv->nfinfo_alloc_failed++; return (ENOMEM); } /* * Now fle is totally ours. It is detached from all lists, * we can safely edit it. */ fle6->f.version = IP6VERSION; bcopy(r, &fle6->f.r, sizeof(struct flow6_rec)); fle6->f.bytes = plen; fle6->f.packets = 1; fle6->f.tcp_flags = tcp_flags; fle6->f.first = fle6->f.last = time_uptime; /* * First we do route table lookup on destination address. So we can * fill in out_ifx, dst_mask, nexthop, and dst_as in future releases. */ if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) { struct rtentry *rt; struct route_nhop_data rnd; rt = fib6_lookup_rt(r->fib, &fle6->f.r.dst.r_dst6, 0, NHR_NONE, &rnd); if (rt != NULL) { struct in6_addr addr; uint32_t scopeid; struct nhop_object *nh = nhop_select_func(rnd.rnd_nhop, 0); int plen; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); fle6->f.fle_o_ifx = nh->nh_ifp->if_index; if (nh->gw_sa.sa_family == AF_INET6) fle6->f.n.next_hop6 = nh->gw6_sa.sin6_addr; fle6->f.dst_mask = plen; } } if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) { /* Do route lookup on source address, to fill in src_mask. */ struct rtentry *rt; struct route_nhop_data rnd; rt = fib6_lookup_rt(r->fib, &fle6->f.r.src.r_src6, 0, NHR_NONE, &rnd); if (rt != NULL) { struct in6_addr addr; uint32_t scopeid; int plen; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); fle6->f.src_mask = plen; } } /* Push new flow at the and of hash. */ TAILQ_INSERT_TAIL(&hsh6->head, (struct flow_entry *)fle6, fle_hash); return (0); } #endif /* * Non-static functions called from ng_netflow.c */ /* Allocate memory and set up flow cache */ void ng_netflow_cache_init(priv_p priv) { struct flow_hash_entry *hsh; int i; /* Initialize cache UMA zone. */ priv->zone = uma_zcreate("NetFlow IPv4 cache", sizeof(struct flow_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(priv->zone, CACHESIZE); #ifdef INET6 priv->zone6 = uma_zcreate("NetFlow IPv6 cache", sizeof(struct flow6_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(priv->zone6, CACHESIZE); #endif /* Allocate hash. */ priv->hash = malloc(NBUCKETS * sizeof(struct flow_hash_entry), M_NETFLOW_HASH, M_WAITOK | M_ZERO); /* Initialize hash. */ for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++) { mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF); TAILQ_INIT(&hsh->head); } #ifdef INET6 /* Allocate hash. */ priv->hash6 = malloc(NBUCKETS * sizeof(struct flow_hash_entry), M_NETFLOW_HASH, M_WAITOK | M_ZERO); /* Initialize hash. */ for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++) { mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF); TAILQ_INIT(&hsh->head); } #endif priv->nfinfo_bytes = counter_u64_alloc(M_WAITOK); priv->nfinfo_packets = counter_u64_alloc(M_WAITOK); priv->nfinfo_bytes6 = counter_u64_alloc(M_WAITOK); priv->nfinfo_packets6 = counter_u64_alloc(M_WAITOK); priv->nfinfo_sbytes = counter_u64_alloc(M_WAITOK); priv->nfinfo_spackets = counter_u64_alloc(M_WAITOK); priv->nfinfo_sbytes6 = counter_u64_alloc(M_WAITOK); priv->nfinfo_spackets6 = counter_u64_alloc(M_WAITOK); priv->nfinfo_act_exp = counter_u64_alloc(M_WAITOK); priv->nfinfo_inact_exp = counter_u64_alloc(M_WAITOK); ng_netflow_v9_cache_init(priv); CTR0(KTR_NET, "ng_netflow startup()"); } /* Initialize new FIB table for v5 and v9 */ int ng_netflow_fib_init(priv_p priv, int fib) { fib_export_p fe = priv_to_fib(priv, fib); CTR1(KTR_NET, "ng_netflow(): fib init: %d", fib); if (fe != NULL) return (0); if ((fe = malloc(sizeof(struct fib_export), M_NETGRAPH, M_NOWAIT | M_ZERO)) == NULL) return (ENOMEM); mtx_init(&fe->export_mtx, "export dgram lock", NULL, MTX_DEF); mtx_init(&fe->export9_mtx, "export9 dgram lock", NULL, MTX_DEF); fe->fib = fib; fe->domain_id = fib; if (atomic_cmpset_ptr((volatile uintptr_t *)&priv->fib_data[fib], (uintptr_t)NULL, (uintptr_t)fe) == 0) { /* FIB already set up by other ISR */ CTR3(KTR_NET, "ng_netflow(): fib init: %d setup %p but got %p", fib, fe, priv_to_fib(priv, fib)); mtx_destroy(&fe->export_mtx); mtx_destroy(&fe->export9_mtx); free(fe, M_NETGRAPH); } else { /* Increase counter for statistics */ CTR3(KTR_NET, "ng_netflow(): fib %d setup to %p (%p)", fib, fe, priv_to_fib(priv, fib)); priv->nfinfo_alloc_fibs++; } return (0); } /* Free all flow cache memory. Called from node close method. */ void ng_netflow_cache_flush(priv_p priv) { struct flow_entry *fle, *fle1; struct flow_hash_entry *hsh; struct netflow_export_item exp; fib_export_p fe; int i; bzero(&exp, sizeof(exp)); /* * We are going to free probably billable data. * Expire everything before freeing it. * No locking is required since callout is already drained. */ for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++) TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); fe = priv_to_fib(priv, fle->f.r.fib); expire_flow(priv, fe, fle, NG_QUEUE); } #ifdef INET6 for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++) TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); fe = priv_to_fib(priv, fle->f.r.fib); expire_flow(priv, fe, fle, NG_QUEUE); } #endif uma_zdestroy(priv->zone); /* Destroy hash mutexes. */ for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++) mtx_destroy(&hsh->mtx); /* Free hash memory. */ if (priv->hash != NULL) free(priv->hash, M_NETFLOW_HASH); #ifdef INET6 uma_zdestroy(priv->zone6); /* Destroy hash mutexes. */ for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++) mtx_destroy(&hsh->mtx); /* Free hash memory. */ if (priv->hash6 != NULL) free(priv->hash6, M_NETFLOW_HASH); #endif for (i = 0; i < priv->maxfibs; i++) { if ((fe = priv_to_fib(priv, i)) == NULL) continue; if (fe->exp.item != NULL) export_send(priv, fe, fe->exp.item, NG_QUEUE); if (fe->exp.item9 != NULL) export9_send(priv, fe, fe->exp.item9, fe->exp.item9_opt, NG_QUEUE); mtx_destroy(&fe->export_mtx); mtx_destroy(&fe->export9_mtx); free(fe, M_NETGRAPH); } counter_u64_free(priv->nfinfo_bytes); counter_u64_free(priv->nfinfo_packets); counter_u64_free(priv->nfinfo_bytes6); counter_u64_free(priv->nfinfo_packets6); counter_u64_free(priv->nfinfo_sbytes); counter_u64_free(priv->nfinfo_spackets); counter_u64_free(priv->nfinfo_sbytes6); counter_u64_free(priv->nfinfo_spackets6); counter_u64_free(priv->nfinfo_act_exp); counter_u64_free(priv->nfinfo_inact_exp); ng_netflow_v9_cache_flush(priv); } #ifdef INET /* Insert packet from into flow cache. */ int ng_netflow_flow_add(priv_p priv, fib_export_p fe, struct ip *ip, caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags, unsigned int src_if_index) { struct flow_entry *fle, *fle1; struct flow_hash_entry *hsh; struct flow_rec r; int hlen, plen; int error = 0; uint16_t eproto; uint8_t tcp_flags = 0; bzero(&r, sizeof(r)); if (ip->ip_v != IPVERSION) return (EINVAL); hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) return (EINVAL); eproto = ETHERTYPE_IP; /* Assume L4 template by default */ r.flow_type = NETFLOW_V9_FLOW_V4_L4; r.r_src = ip->ip_src; r.r_dst = ip->ip_dst; r.fib = fe->fib; plen = ntohs(ip->ip_len); r.r_ip_p = ip->ip_p; r.r_tos = ip->ip_tos; r.r_i_ifx = src_if_index; /* * XXX NOTE: only first fragment of fragmented TCP, UDP and * ICMP packet will be recorded with proper s_port and d_port. * Following fragments will be recorded simply as IP packet with * ip_proto = ip->ip_p and s_port, d_port set to zero. * I know, it looks like bug. But I don't want to re-implement * ip packet assebmling here. Anyway, (in)famous trafd works this way - * and nobody complains yet :) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0) switch(r.r_ip_p) { case IPPROTO_TCP: { struct tcphdr *tcp; tcp = (struct tcphdr *)((caddr_t )ip + hlen); r.r_sport = tcp->th_sport; r.r_dport = tcp->th_dport; tcp_flags = tcp->th_flags; break; } case IPPROTO_UDP: r.r_ports = *(uint32_t *)((caddr_t )ip + hlen); break; } counter_u64_add(priv->nfinfo_packets, 1); counter_u64_add(priv->nfinfo_bytes, plen); /* Find hash slot. */ hsh = &priv->hash[ip_hash(&r)]; mtx_lock(&hsh->mtx); /* * Go through hash and find our entry. If we encounter an * entry, that should be expired, purge it. We do a reverse * search since most active entries are first, and most * searches are done on most active entries. */ TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) { if (bcmp(&r, &fle->f.r, sizeof(struct flow_rec)) == 0) break; if ((INACTIVE(fle) && SMALL(fle)) || AGED(fle)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_QUEUE); counter_u64_add(priv->nfinfo_act_exp, 1); } } if (fle) { /* An existent entry. */ fle->f.bytes += plen; fle->f.packets ++; fle->f.tcp_flags |= tcp_flags; fle->f.last = time_uptime; /* * We have the following reasons to expire flow in active way: * - it hit active timeout * - a TCP connection closed * - it is going to overflow counter */ if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle) || (fle->f.bytes >= (CNTR_MAX - IF_MAXMTU)) ) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_QUEUE); counter_u64_add(priv->nfinfo_act_exp, 1); } else { /* * It is the newest, move it to the tail, * if it isn't there already. Next search will * locate it quicker. */ if (fle != TAILQ_LAST(&hsh->head, fhead)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash); } } } else /* A new flow entry. */ error = hash_insert(priv, hsh, &r, plen, flags, tcp_flags); mtx_unlock(&hsh->mtx); return (error); } #endif #ifdef INET6 /* Insert IPv6 packet from into flow cache. */ int ng_netflow_flow6_add(priv_p priv, fib_export_p fe, struct ip6_hdr *ip6, caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags, unsigned int src_if_index) { struct flow_entry *fle = NULL, *fle1; struct flow6_entry *fle6; struct flow_hash_entry *hsh; struct flow6_rec r; int plen; int error = 0; uint8_t tcp_flags = 0; /* check version */ if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) return (EINVAL); bzero(&r, sizeof(r)); r.src.r_src6 = ip6->ip6_src; r.dst.r_dst6 = ip6->ip6_dst; r.fib = fe->fib; /* Assume L4 template by default */ r.flow_type = NETFLOW_V9_FLOW_V6_L4; plen = ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr); #if 0 /* XXX: set DSCP/CoS value */ r.r_tos = ip->ip_tos; #endif if ((flags & NG_NETFLOW_IS_FRAG) == 0) { switch(upper_proto) { case IPPROTO_TCP: { struct tcphdr *tcp; tcp = (struct tcphdr *)upper_ptr; r.r_ports = *(uint32_t *)upper_ptr; tcp_flags = tcp->th_flags; break; } case IPPROTO_UDP: case IPPROTO_SCTP: r.r_ports = *(uint32_t *)upper_ptr; break; } } r.r_ip_p = upper_proto; r.r_i_ifx = src_if_index; counter_u64_add(priv->nfinfo_packets6, 1); counter_u64_add(priv->nfinfo_bytes6, plen); /* Find hash slot. */ hsh = &priv->hash6[ip6_hash(&r)]; mtx_lock(&hsh->mtx); /* * Go through hash and find our entry. If we encounter an * entry, that should be expired, purge it. We do a reverse * search since most active entries are first, and most * searches are done on most active entries. */ TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) { if (fle->f.version != IP6VERSION) continue; fle6 = (struct flow6_entry *)fle; if (bcmp(&r, &fle6->f.r, sizeof(struct flow6_rec)) == 0) break; if ((INACTIVE(fle6) && SMALL(fle6)) || AGED(fle6)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_QUEUE); counter_u64_add(priv->nfinfo_act_exp, 1); } } if (fle != NULL) { /* An existent entry. */ fle6 = (struct flow6_entry *)fle; fle6->f.bytes += plen; fle6->f.packets ++; fle6->f.tcp_flags |= tcp_flags; fle6->f.last = time_uptime; /* * We have the following reasons to expire flow in active way: * - it hit active timeout * - a TCP connection closed * - it is going to overflow counter */ if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle6) || (fle6->f.bytes >= (CNTR_MAX - IF_MAXMTU)) ) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_QUEUE); counter_u64_add(priv->nfinfo_act_exp, 1); } else { /* * It is the newest, move it to the tail, * if it isn't there already. Next search will * locate it quicker. */ if (fle != TAILQ_LAST(&hsh->head, fhead)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash); } } } else /* A new flow entry. */ error = hash6_insert(priv, hsh, &r, plen, flags, tcp_flags); mtx_unlock(&hsh->mtx); return (error); } #endif /* * Return records from cache to userland. * * TODO: matching particular IP should be done in kernel, here. */ int ng_netflow_flow_show(priv_p priv, struct ngnf_show_header *req, struct ngnf_show_header *resp) { struct flow_hash_entry *hsh; struct flow_entry *fle; struct flow_entry_data *data = (struct flow_entry_data *)(resp + 1); #ifdef INET6 struct flow6_entry_data *data6 = (struct flow6_entry_data *)(resp + 1); #endif int i, max; i = req->hash_id; if (i > NBUCKETS-1) return (EINVAL); #ifdef INET6 if (req->version == 6) { resp->version = 6; hsh = priv->hash6 + i; max = NREC6_AT_ONCE; } else #endif if (req->version == 4) { resp->version = 4; hsh = priv->hash + i; max = NREC_AT_ONCE; } else return (EINVAL); /* * We will transfer not more than NREC_AT_ONCE. More data * will come in next message. * We send current hash index and current record number in list * to userland, and userland should return it back to us. * Then, we will restart with new entry. * * The resulting cache snapshot can be inaccurate if flow expiration * is taking place on hash item between userland data requests for * this hash item id. */ resp->nentries = 0; for (; i < NBUCKETS; hsh++, i++) { int list_id; if (mtx_trylock(&hsh->mtx) == 0) { /* * Requested hash index is not available, * relay decision to skip or re-request data * to userland. */ resp->hash_id = i; resp->list_id = 0; return (0); } list_id = 0; TAILQ_FOREACH(fle, &hsh->head, fle_hash) { if (hsh->mtx.mtx_lock & MTX_CONTESTED) { resp->hash_id = i; resp->list_id = list_id; mtx_unlock(&hsh->mtx); return (0); } list_id++; /* Search for particular record in list. */ if (req->list_id > 0) { if (list_id < req->list_id) continue; /* Requested list position found. */ req->list_id = 0; } #ifdef INET6 if (req->version == 6) { struct flow6_entry *fle6; fle6 = (struct flow6_entry *)fle; bcopy(&fle6->f, data6 + resp->nentries, sizeof(fle6->f)); } else #endif bcopy(&fle->f, data + resp->nentries, sizeof(fle->f)); resp->nentries++; if (resp->nentries == max) { resp->hash_id = i; /* * If it was the last item in list * we simply skip to next hash_id. */ resp->list_id = list_id + 1; mtx_unlock(&hsh->mtx); return (0); } } mtx_unlock(&hsh->mtx); } resp->hash_id = resp->list_id = 0; return (0); } /* We have full datagram in privdata. Send it to export hook. */ static int export_send(priv_p priv, fib_export_p fe, item_p item, int flags) { struct mbuf *m = NGI_M(item); struct netflow_v5_export_dgram *dgram = mtod(m, struct netflow_v5_export_dgram *); struct netflow_v5_header *header = &dgram->header; struct timespec ts; int error = 0; /* Fill mbuf header. */ m->m_len = m->m_pkthdr.len = sizeof(struct netflow_v5_record) * header->count + sizeof(struct netflow_v5_header); /* Fill export header. */ header->sys_uptime = htonl(MILLIUPTIME(time_uptime)); getnanotime(&ts); header->unix_secs = htonl(ts.tv_sec); header->unix_nsecs = htonl(ts.tv_nsec); header->engine_type = 0; header->engine_id = fe->domain_id; header->pad = 0; header->flow_seq = htonl(atomic_fetchadd_32(&fe->flow_seq, header->count)); header->count = htons(header->count); if (priv->export != NULL) NG_FWD_ITEM_HOOK_FLAGS(error, item, priv->export, flags); else NG_FREE_ITEM(item); return (error); } /* Add export record to dgram. */ static int export_add(item_p item, struct flow_entry *fle) { struct netflow_v5_export_dgram *dgram = mtod(NGI_M(item), struct netflow_v5_export_dgram *); struct netflow_v5_header *header = &dgram->header; struct netflow_v5_record *rec; rec = &dgram->r[header->count]; header->count ++; KASSERT(header->count <= NETFLOW_V5_MAX_RECORDS, ("ng_netflow: export too big")); /* Fill in export record. */ rec->src_addr = fle->f.r.r_src.s_addr; rec->dst_addr = fle->f.r.r_dst.s_addr; rec->next_hop = fle->f.next_hop.s_addr; rec->i_ifx = htons(fle->f.fle_i_ifx); rec->o_ifx = htons(fle->f.fle_o_ifx); rec->packets = htonl(fle->f.packets); rec->octets = htonl(fle->f.bytes); rec->first = htonl(MILLIUPTIME(fle->f.first)); rec->last = htonl(MILLIUPTIME(fle->f.last)); rec->s_port = fle->f.r.r_sport; rec->d_port = fle->f.r.r_dport; rec->flags = fle->f.tcp_flags; rec->prot = fle->f.r.r_ip_p; rec->tos = fle->f.r.r_tos; rec->dst_mask = fle->f.dst_mask; rec->src_mask = fle->f.src_mask; rec->pad1 = 0; rec->pad2 = 0; /* Not supported fields. */ rec->src_as = rec->dst_as = 0; if (header->count == NETFLOW_V5_MAX_RECORDS) return (1); /* end of datagram */ else return (0); } /* Periodic flow expiry run. */ void ng_netflow_expire(void *arg) { struct flow_entry *fle, *fle1; struct flow_hash_entry *hsh; priv_p priv = (priv_p )arg; int used, i; /* * Going through all the cache. */ used = uma_zone_get_cur(priv->zone); for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++) { /* * Skip entries, that are already being worked on. */ if (mtx_trylock(&hsh->mtx) == 0) continue; TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) { /* * Interrupt thread wants this entry! * Quick! Quick! Bail out! */ if (hsh->mtx.mtx_lock & MTX_CONTESTED) break; /* * Don't expire aggressively while hash collision * ratio is predicted small. */ if (used <= (NBUCKETS*2) && !INACTIVE(fle)) break; if ((INACTIVE(fle) && (SMALL(fle) || (used > (NBUCKETS*2)))) || AGED(fle)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_NOFLAGS); used--; counter_u64_add(priv->nfinfo_inact_exp, 1); } } mtx_unlock(&hsh->mtx); } #ifdef INET6 used = uma_zone_get_cur(priv->zone6); for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++) { struct flow6_entry *fle6; /* * Skip entries, that are already being worked on. */ if (mtx_trylock(&hsh->mtx) == 0) continue; TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) { fle6 = (struct flow6_entry *)fle; /* * Interrupt thread wants this entry! * Quick! Quick! Bail out! */ if (hsh->mtx.mtx_lock & MTX_CONTESTED) break; /* * Don't expire aggressively while hash collision * ratio is predicted small. */ if (used <= (NBUCKETS*2) && !INACTIVE(fle6)) break; if ((INACTIVE(fle6) && (SMALL(fle6) || (used > (NBUCKETS*2)))) || AGED(fle6)) { TAILQ_REMOVE(&hsh->head, fle, fle_hash); expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle, NG_NOFLAGS); used--; counter_u64_add(priv->nfinfo_inact_exp, 1); } } mtx_unlock(&hsh->mtx); } #endif /* Schedule next expire. */ callout_reset(&priv->exp_callout, (1*hz), &ng_netflow_expire, (void *)priv); } diff --git a/sys/netgraph/ng_iface.c b/sys/netgraph/ng_iface.c index 1e586d687244..e6871435fa88 100644 --- a/sys/netgraph/ng_iface.c +++ b/sys/netgraph/ng_iface.c @@ -1,818 +1,818 @@ /* * ng_iface.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * * $FreeBSD$ * $Whistle: ng_iface.c,v 1.33 1999/11/01 09:24:51 julian Exp $ */ /* * This node is also a system networking interface. It has * a hook for each protocol (IP, AppleTalk, etc). Packets * are simply relayed between the interface and the hooks. * * Interfaces are named ng0, ng1, etc. New nodes take the * first available interface name. * * This node also includes Berkeley packet filter support. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_IFACE, "netgraph_iface", "netgraph iface node"); #else #define M_NETGRAPH_IFACE M_NETGRAPH #endif static SYSCTL_NODE(_net_graph, OID_AUTO, iface, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Point to point netgraph interface"); VNET_DEFINE_STATIC(int, ng_iface_max_nest) = 2; #define V_ng_iface_max_nest VNET(ng_iface_max_nest) SYSCTL_INT(_net_graph_iface, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ng_iface_max_nest), 0, "Max nested tunnels"); /* This struct describes one address family */ struct iffam { sa_family_t family; /* Address family */ const char *hookname; /* Name for hook */ }; typedef const struct iffam *iffam_p; /* List of address families supported by our interface */ const static struct iffam gFamilies[] = { { AF_INET, NG_IFACE_HOOK_INET }, { AF_INET6, NG_IFACE_HOOK_INET6 }, }; #define NUM_FAMILIES nitems(gFamilies) /* Node private data */ struct ng_iface_private { struct ifnet *ifp; /* Our interface */ int unit; /* Interface unit number */ node_p node; /* Our netgraph node */ hook_p hooks[NUM_FAMILIES]; /* Hook for each address family */ struct rmlock lock; /* Protect private data changes */ }; typedef struct ng_iface_private *priv_p; #define PRIV_RLOCK(priv, t) rm_rlock(&priv->lock, t) #define PRIV_RUNLOCK(priv, t) rm_runlock(&priv->lock, t) #define PRIV_WLOCK(priv) rm_wlock(&priv->lock) #define PRIV_WUNLOCK(priv) rm_wunlock(&priv->lock) /* Interface methods */ static void ng_iface_start(struct ifnet *ifp); static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro); static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa); #ifdef DEBUG static void ng_iface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data); #endif /* Netgraph methods */ static int ng_iface_mod_event(module_t, int, void *); static ng_constructor_t ng_iface_constructor; static ng_rcvmsg_t ng_iface_rcvmsg; static ng_shutdown_t ng_iface_shutdown; static ng_newhook_t ng_iface_newhook; static ng_rcvdata_t ng_iface_rcvdata; static ng_disconnect_t ng_iface_disconnect; /* Helper stuff */ static iffam_p get_iffam_from_af(sa_family_t family); static iffam_p get_iffam_from_hook(priv_p priv, hook_p hook); static iffam_p get_iffam_from_name(const char *name); static hook_p *get_hook_from_iffam(priv_p priv, iffam_p iffam); /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_iface_cmds[] = { { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFNAME, "getifname", NULL, &ng_parse_string_type }, { NGM_IFACE_COOKIE, NGM_IFACE_POINT2POINT, "point2point", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_BROADCAST, "broadcast", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFINDEX, "getifindex", NULL, &ng_parse_uint32_type }, { 0 } }; /* Node type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_IFACE_NODE_TYPE, .mod_event = ng_iface_mod_event, .constructor = ng_iface_constructor, .rcvmsg = ng_iface_rcvmsg, .shutdown = ng_iface_shutdown, .newhook = ng_iface_newhook, .rcvdata = ng_iface_rcvdata, .disconnect = ng_iface_disconnect, .cmdlist = ng_iface_cmds, }; NETGRAPH_INIT(iface, &typestruct); VNET_DEFINE_STATIC(struct unrhdr *, ng_iface_unit); #define V_ng_iface_unit VNET(ng_iface_unit) /************************************************************************ HELPER STUFF ************************************************************************/ /* * Get the family descriptor from the family ID */ static __inline iffam_p get_iffam_from_af(sa_family_t family) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (iffam->family == family) return (iffam); } return (NULL); } /* * Get the family descriptor from the hook */ static __inline iffam_p get_iffam_from_hook(priv_p priv, hook_p hook) { int k; for (k = 0; k < NUM_FAMILIES; k++) if (priv->hooks[k] == hook) return (&gFamilies[k]); return (NULL); } /* * Get the hook from the iffam descriptor */ static __inline hook_p * get_hook_from_iffam(priv_p priv, iffam_p iffam) { return (&priv->hooks[iffam - gFamilies]); } /* * Get the iffam descriptor from the name */ static __inline iffam_p get_iffam_from_name(const char *name) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (!strcmp(iffam->hookname, name)) return (iffam); } return (NULL); } /************************************************************************ INTERFACE STUFF ************************************************************************/ /* * Process an ioctl for the virtual interface */ static int ng_iface_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifreq *const ifr = (struct ifreq *) data; int error = 0; #ifdef DEBUG ng_iface_print_ioctl(ifp, command, data); #endif switch (command) { /* These two are mostly handled at a higher layer */ case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); break; case SIOCGIFADDR: break; /* Set flags */ case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. * If it is marked down and running, then stop it. */ if (ifr->ifr_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); ifp->if_drv_flags |= IFF_DRV_RUNNING; } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } break; /* Set the interface MTU */ case SIOCSIFMTU: if (ifr->ifr_mtu > NG_IFACE_MTU_MAX || ifr->ifr_mtu < NG_IFACE_MTU_MIN) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; /* Stuff that's not supported */ case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFPHYS: error = EOPNOTSUPP; break; default: error = EINVAL; break; } return (error); } /* * This routine is called to deliver a packet out the interface. * We simply look at the address family and relay the packet to * the corresponding hook, if it exists and is connected. */ static int ng_iface_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; int error; /* Check interface flags */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { m_freem(m); return (ENETDOWN); } /* Protect from deadly infinite recursion. */ error = if_tunnel_check_nesting(ifp, m, NGM_IFACE_COOKIE, V_ng_iface_max_nest); if (error) { m_freem(m); return (error); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else - af = dst->sa_family; + af = RO_GET_FAMILY(ro, dst); /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, af); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { M_PREPEND(m, sizeof(sa_family_t), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return (ENOBUFS); } *(sa_family_t *)m->m_data = af; error = (ifp->if_transmit)(ifp, m); } else error = ng_iface_send(ifp, m, af); return (error); } /* * Start method is used only when ALTQ is enabled. */ static void ng_iface_start(struct ifnet *ifp) { struct mbuf *m; sa_family_t sa; KASSERT(ALTQ_IS_ENABLED(&ifp->if_snd), ("%s without ALTQ", __func__)); for(;;) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; sa = *mtod(m, sa_family_t *); m_adj(m, sizeof(sa_family_t)); ng_iface_send(ifp, m, sa); } } /* * Flash a packet by the BPF (requires prepending 4 byte AF header) * Note the phoney mbuf; this is OK because BPF treats it read-only. */ static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family) { KASSERT(family != AF_UNSPEC, ("%s: family=AF_UNSPEC", __func__)); if (bpf_peers_present(ifp->if_bpf)) { int32_t family4 = (int32_t)family; bpf_mtap2(ifp->if_bpf, &family4, sizeof(family4), m); } } /* * This routine does actual delivery of the packet into the * netgraph(4). It is called from ng_iface_start() and * ng_iface_output(). */ static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa) { struct rm_priotracker priv_tracker; const priv_p priv = (priv_p) ifp->if_softc; const iffam_p iffam = get_iffam_from_af(sa); hook_p hook; int error; int len; /* Check address family to determine hook (if known) */ if (iffam == NULL) { m_freem(m); log(LOG_WARNING, "%s: can't handle af%d\n", ifp->if_xname, sa); return (EAFNOSUPPORT); } /* Copy length before the mbuf gets invalidated. */ len = m->m_pkthdr.len; PRIV_RLOCK(priv, &priv_tracker); hook = *get_hook_from_iffam(priv, iffam); if (hook == NULL) { NG_FREE_M(m); PRIV_RUNLOCK(priv, &priv_tracker); return ENETDOWN; } NG_HOOK_REF(hook); PRIV_RUNLOCK(priv, &priv_tracker); NG_OUTBOUND_THREAD_REF(); NG_SEND_DATA_ONLY(error, hook, m); NG_OUTBOUND_THREAD_UNREF(); NG_HOOK_UNREF(hook); /* Update stats. */ if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } return (error); } #ifdef DEBUG /* * Display an ioctl to the virtual interface */ static void ng_iface_print_ioctl(struct ifnet *ifp, int command, caddr_t data) { char *str; switch (command & IOC_DIRMASK) { case IOC_VOID: str = "IO"; break; case IOC_OUT: str = "IOR"; break; case IOC_IN: str = "IOW"; break; case IOC_INOUT: str = "IORW"; break; default: str = "IO??"; } log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n", ifp->if_xname, str, IOCGROUP(command), command & 0xff, IOCPARM_LEN(command)); } #endif /* DEBUG */ /************************************************************************ NETGRAPH NODE STUFF ************************************************************************/ /* * Constructor for a node */ static int ng_iface_constructor(node_p node) { struct ifnet *ifp; priv_p priv; /* Allocate node and interface private structures */ priv = malloc(sizeof(*priv), M_NETGRAPH_IFACE, M_WAITOK | M_ZERO); ifp = if_alloc(IFT_PROPVIRTUAL); if (ifp == NULL) { free(priv, M_NETGRAPH_IFACE); return (ENOMEM); } rm_init(&priv->lock, "ng_iface private rmlock"); /* Link them together */ ifp->if_softc = priv; priv->ifp = ifp; /* Get an interface unit number */ priv->unit = alloc_unr(V_ng_iface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); priv->node = node; /* Initialize interface structure */ if_initname(ifp, NG_IFACE_IFACE_NAME, priv->unit); ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); ifp->if_type = IFT_PROPVIRTUAL; /* XXX */ ifp->if_addrlen = 0; /* XXX */ ifp->if_hdrlen = 0; /* XXX */ ifp->if_baudrate = 64000; /* XXX */ IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); /* Give this node the same name as the interface (if possible) */ if (ng_name_node(node, ifp->if_xname) != 0) log(LOG_WARNING, "%s: can't acquire netgraph name\n", ifp->if_xname); /* Attach the interface */ if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); /* Done */ return (0); } /* * Give our ok for a hook to be added */ static int ng_iface_newhook(node_p node, hook_p hook, const char *name) { const iffam_p iffam = get_iffam_from_name(name); const priv_p priv = NG_NODE_PRIVATE(node); hook_p *hookptr; if (iffam == NULL) return (EPFNOSUPPORT); PRIV_WLOCK(priv); hookptr = get_hook_from_iffam(priv, iffam); if (*hookptr != NULL) { PRIV_WUNLOCK(priv); return (EISCONN); } *hookptr = hook; NG_HOOK_HI_STACK(hook); NG_HOOK_SET_TO_INBOUND(hook); PRIV_WUNLOCK(priv); return (0); } /* * Receive a control message */ static int ng_iface_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_IFACE_COOKIE: switch (msg->header.cmd) { case NGM_IFACE_GET_IFNAME: NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } strlcpy(resp->data, ifp->if_xname, IFNAMSIZ); break; case NGM_IFACE_POINT2POINT: case NGM_IFACE_BROADCAST: { /* Deny request if interface is UP */ if ((ifp->if_flags & IFF_UP) != 0) return (EBUSY); /* Change flags */ switch (msg->header.cmd) { case NGM_IFACE_POINT2POINT: ifp->if_flags |= IFF_POINTOPOINT; ifp->if_flags &= ~IFF_BROADCAST; break; case NGM_IFACE_BROADCAST: ifp->if_flags &= ~IFF_POINTOPOINT; ifp->if_flags |= IFF_BROADCAST; break; } break; } case NGM_IFACE_GET_IFINDEX: NG_MKRESPONSE(resp, msg, sizeof(uint32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((uint32_t *)resp->data) = priv->ifp->if_index; break; default: error = EINVAL; break; } break; case NGM_FLOW_COOKIE: switch (msg->header.cmd) { case NGM_LINK_IS_UP: if_link_state_change(ifp, LINK_STATE_UP); break; case NGM_LINK_IS_DOWN: if_link_state_change(ifp, LINK_STATE_DOWN); break; default: break; } break; default: error = EINVAL; break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Recive data from a hook. Pass the packet to the correct input routine. */ static int ng_iface_rcvdata(hook_p hook, item_p item) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); struct ifnet *const ifp = priv->ifp; struct epoch_tracker et; struct mbuf *m; int isr; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Sanity checks */ KASSERT(iffam != NULL, ("%s: iffam", __func__)); M_ASSERTPKTHDR(m); if ((ifp->if_flags & IFF_UP) == 0) { NG_FREE_M(m); return (ENETDOWN); } /* Update interface stats */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, iffam->family); /* Send packet */ switch (iffam->family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG); M_SETFIB(m, ifp->if_fib); CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * Shutdown and remove the node and its associated interface. */ static int ng_iface_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); /* * The ifnet may be in a different vnet than the netgraph node, * hence we have to change the current vnet context here. */ CURVNET_SET_QUIET(priv->ifp->if_vnet); bpfdetach(priv->ifp); if_detach(priv->ifp); if_free(priv->ifp); CURVNET_RESTORE(); priv->ifp = NULL; free_unr(V_ng_iface_unit, priv->unit); rm_destroy(&priv->lock); free(priv, M_NETGRAPH_IFACE); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); return (0); } /* * Hook disconnection. Note that we do *not* shutdown when all * hooks have been disconnected. */ static int ng_iface_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); if (iffam == NULL) panic("%s", __func__); PRIV_WLOCK(priv); *get_hook_from_iffam(priv, iffam) = NULL; PRIV_WUNLOCK(priv); return (0); } /* * Handle loading and unloading for this node type. */ static int ng_iface_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: case MOD_UNLOAD: break; default: error = EOPNOTSUPP; break; } return (error); } static void vnet_ng_iface_init(const void *unused) { V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); } VNET_SYSINIT(vnet_ng_iface_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ng_iface_init, NULL); static void vnet_ng_iface_uninit(const void *unused) { delete_unrhdr(V_ng_iface_unit); } VNET_SYSUNINIT(vnet_ng_iface_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_ng_iface_uninit, NULL); diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c index 44da6b73e41c..facf876f18cc 100644 --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -1,523 +1,528 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * ip_fastforward gets its speed from processing the forwarded packet to * completion (if_output on the other side) without any queues or netisr's. * The receiving interface DMAs the packet into memory, the upper half of * driver calls ip_fastforward, we do our routing table lookup and directly * send it off to the outgoing interface, which DMAs the packet to the * network card. The only part of the packet we touch with the CPU is the * IP header (unless there are complex firewall rules touching other parts * of the packet, but that is up to you). We are essentially limited by bus * bandwidth and how fast the network card/driver can set up receives and * transmits. * * We handle basic errors, IP header errors, checksum errors, * destination unreachable, fragmentation and fragmentation needed and * report them via ICMP to the sender. * * Else if something is not pure IPv4 unicast forwarding we fall back to * the normal ip_input processing path. We should only be called from * interfaces connected to the outside world. * * Firewalling is fully supported including divert, ipfw fwd and ipfilter * ipnat and address rewrite. * * IPSEC is not supported if this host is a tunnel broker. IPSEC is * supported for connections to/from local host. * * We try to do the least expensive (in CPU ops) checks and operations * first to catch junk with as little overhead as possible. * * We take full advantage of hardware support for IP checksum and * fragmentation offloading. * * We don't do ICMP redirect in the fast forwarding path. I have had my own * cases where two core routers with Zebra routing suite would send millions * ICMP redirects to connected hosts if the destination router was not the * default gateway. In one case it was filling the routing table of a host * with approximately 300.000 cloned redirect entries until it ran out of * kernel memory. However the networking code proved very robust and it didn't * crash or fail in other ways. */ /* * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which * is being followed here. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define V_ipsendredirects VNET(ipsendredirects) static struct mbuf * ip_redir_alloc(struct mbuf *m, struct nhop_object *nh, struct ip *ip, in_addr_t *addr) { struct mbuf *mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy == NULL) return (NULL); if (m_dup_pkthdr(mcopy, m, M_NOWAIT) == 0) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); return (NULL); } mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) { struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa); u_long src = ntohl(ip->ip_src.s_addr); if (nh_ia != NULL && (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) { if (nh->nh_flags & NHF_GATEWAY) *addr = nh->gw4_sa.sin_addr.s_addr; else *addr = ip->ip_dst.s_addr; } } return (mcopy); } static int ip_findroute(struct nhop_object **pnh, struct in_addr dest, struct mbuf *m) { struct nhop_object *nh; nh = fib4_lookup(M_GETFIB(m), dest, 0, NHR_NONE, m->m_pkthdr.flowid); if (nh == NULL) { IPSTAT_INC(ips_noroute); IPSTAT_INC(ips_cantforward); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return (EHOSTUNREACH); } /* * Drop blackholed traffic and directed broadcasts. */ if ((nh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return (EHOSTUNREACH); } if (nh->nh_flags & NHF_REJECT) { IPSTAT_INC(ips_cantforward); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return (EHOSTUNREACH); } *pnh = nh; return (0); } /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. * If the packet is handled (and consumed) here then we return NULL; * otherwise mbuf is returned and the packet should be delivered * to ip_input for full processing. */ struct mbuf * ip_tryforward(struct mbuf *m) { struct ip *ip; struct mbuf *m0 = NULL; struct nhop_object *nh = NULL; - struct sockaddr_in dst; + struct route ro; + struct sockaddr_in *dst; + const struct sockaddr *gw; struct in_addr dest, odest, rtdest; uint16_t ip_len, ip_off; int error = 0; struct m_tag *fwd_tag = NULL; struct mbuf *mcopy = NULL; struct in_addr redest; /* * Are we active and forwarding packets? */ M_ASSERTVALID(m); M_ASSERTPKTHDR(m); #ifdef ALTQ /* * Is packet dropped by traffic conditioner? */ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) goto drop; #endif /* * Only IP packets without options */ ip = mtod(m, struct ip *); if (ip->ip_hl != (sizeof(struct ip) >> 2)) { if (V_ip_doopts == 1) return m; else if (V_ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); return NULL; /* mbuf already free'd */ } /* else ignore IP options and continue */ } /* * Only unicast IP, not from loopback, no L2 or IP broadcast, * no multicast, no INADDR_ANY * * XXX: Probably some of these checks could be direct drop * conditions. However it is not clear whether there are some * hacks or obscure behaviours which make it necessary to * let ip_input handle it. We play safe here and let ip_input * deal with it until it is proven that we can directly drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) || (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || ip->ip_src.s_addr == INADDR_ANY || ip->ip_dst.s_addr == INADDR_ANY ) return m; /* * Is it for a local address on this host? */ if (in_localip(ip->ip_dst)) return m; IPSTAT_INC(ips_total); /* * Step 3: incoming packet firewall processing */ odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; /* * Run through list of ipfilter hooks for input packets */ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; if (pfil_run_hooks(V_inet_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) != PFIL_PASS) goto drop; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); /* m may have changed by pfil hook */ dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (odest.s_addr != dest.s_addr) { /* * Is it now for a local address on this host? */ if (in_localip(dest)) goto forwardlocal; /* * Go on with new destination address */ } if (m->m_flags & M_FASTFWD_OURS) { /* * ipfw changed it for a local address on this host. */ goto forwardlocal; } passin: /* * Step 4: decrement TTL and look up route */ /* * Check TTL */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return NULL; /* mbuf already free'd */ } /* * Decrement the TTL and incrementally change the IP header checksum. * Don't bother doing this with hw checksum offloading, it's faster * doing it right here. */ ip->ip_ttl -= IPTTLDEC; if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) ip->ip_sum -= ~htons(IPTTLDEC << 8); else ip->ip_sum += htons(IPTTLDEC << 8); #ifdef IPSTEALTH } #endif /* * Next hop forced by pfil(9) hook? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { /* * Now we will find route to forced destination. */ dest.s_addr = ((struct sockaddr_in *) (fwd_tag + 1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } /* * Find route to destination. */ if (ip_findroute(&nh, dest, m) != 0) return (NULL); /* icmp unreach already sent */ /* * Avoid second route lookup by caching destination. */ rtdest.s_addr = dest.s_addr; /* * Step 5: outgoing firewall packet processing */ if (!PFIL_HOOKED_OUT(V_inet_pfil_head)) goto passout; if (pfil_run_hooks(V_inet_pfil_head, &m, nh->nh_ifp, PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) goto drop; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (m->m_flags & M_IP_NEXTHOP) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); else fwd_tag = NULL; if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { /* * Is it now for a local address on this host? */ if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { forwardlocal: /* * Return packet for processing by ip_input(). */ m->m_flags |= M_FASTFWD_OURS; return (m); } /* * Redo route lookup with new destination address */ if (fwd_tag) { dest.s_addr = ((struct sockaddr_in *) (fwd_tag + 1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } if (dest.s_addr != rtdest.s_addr && ip_findroute(&nh, dest, m) != 0) return (NULL); /* icmp unreach already sent */ } passout: /* * Step 6: send off the packet */ ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); - bzero(&dst, sizeof(dst)); - dst.sin_family = AF_INET; - dst.sin_len = sizeof(dst); - if (nh->nh_flags & NHF_GATEWAY) - dst.sin_addr = nh->gw4_sa.sin_addr; - else - dst.sin_addr = dest; + bzero(&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = dest; + if (nh->nh_flags & NHF_GATEWAY) { + gw = &nh->gw_sa; + ro.ro_flags |= RT_HAS_GW; + } else + gw = (const struct sockaddr *)dst; /* * Handle redirect case. */ redest.s_addr = 0; - if (V_ipsendredirects && (nh->nh_ifp == m->m_pkthdr.rcvif)) + if (V_ipsendredirects && (nh->nh_ifp == m->m_pkthdr.rcvif) && + gw->sa_family == AF_INET) mcopy = ip_redir_alloc(m, nh, ip, &redest.s_addr); /* * Check if packet fits MTU or if hardware will fragment for us */ if (ip_len <= nh->nh_mtu) { /* * Avoid confusing lower layers. */ m_clrprotoflags(m); /* * Send off the packet via outgoing interface */ IP_PROBE(send, NULL, NULL, ip, nh->nh_ifp, ip, NULL); - error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, - (struct sockaddr *)&dst, NULL); + error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, gw, &ro); } else { /* * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip_off & IP_DF) { IPSTAT_INC(ips_cantfrag); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, nh->nh_mtu); goto consumed; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &m, nh->nh_mtu, nh->nh_ifp->if_hwassist) != 0) goto drop; KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface */ error = 0; do { m0 = m->m_nextpkt; m->m_nextpkt = NULL; /* * Avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), nh->nh_ifp, mtod(m, struct ip *), NULL); error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, - (struct sockaddr *)&dst, NULL); + gw, &ro); if (error) break; } while ((m = m0) != NULL); if (error) { /* Reclaim remaining fragments */ for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m_freem(m); } } else IPSTAT_INC(ips_fragmented); } } if (error != 0) IPSTAT_INC(ips_odropped); else { IPSTAT_INC(ips_forward); IPSTAT_INC(ips_fastforward); } /* Send required redirect */ if (mcopy != NULL) { icmp_error(mcopy, ICMP_REDIRECT, ICMP_REDIRECT_HOST, redest.s_addr, 0); mcopy = NULL; /* Freed by caller */ } consumed: if (mcopy != NULL) m_freem(mcopy); return NULL; drop: if (m) m_freem(m); return NULL; } diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 733cc2901879..465c00e4dac7 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1,1436 +1,1439 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); /* * Respond with an ICMP host redirect when we forward a packet out of * the same interface on which it was received. See RFC 792. */ VNET_DEFINE(int, ipsendredirects) = 1; SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ VNET_DEFINE_STATIC(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(pfil_head_t, inet_pfil_head); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct pfil_head_args args; struct protosw *pr; int i; CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_IP4; args.pa_headname = PFIL_INET_NAME; V_inet_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); pfil_head_unregister(V_inet_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ rib_flush_routes_family(AF_INET); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { MROUTER_RLOCK_TRACKER; struct rm_priotracker in_ifa_tracker; struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* IN_LOOPBACK must not appear on the wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Try to forward the packet, but if we fail continue. * ip_tryforward() does not generate redirects, so fall * through to normal processing if redirects are required. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ if (V_ipforwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ip = mtod(m, struct ip *); goto ours; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) != PFIL_PASS) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto ours; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); goto ours; } #endif } ia = NULL; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { MROUTER_RLOCK(); /* * RFC 3927 2.7: Do not forward multicast packets from * IN_LINKLOCAL. */ if (V_ip_mrouter && !IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { MROUTER_RUNLOCK(); IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) { MROUTER_RUNLOCK(); goto ours; } IPSTAT_INC(ips_forward); } MROUTER_RUNLOCK(); /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* RFC 3927 2.7: Do not forward packets to or from IN_LINKLOCAL. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; uint32_t flowid; int error, type = 0, code = 0, mtu = 0; NET_EPOCH_ASSERT(); if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if ( #ifdef IPSTEALTH V_ipstealth == 0 && #endif ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; flowid = m->m_pkthdr.flowid; ro.ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_REF, flowid); if (ro.ro_nh != NULL) { ia = ifatoia(ro.ro_nh->nh_ifa); } else ia = NULL; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copym() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (V_ipstealth == 0) #endif ip->ip_ttl -= IPTTLDEC; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { /* mbuf consumed by IPsec */ RO_NHFREE(&ro); m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); return; } /* No IPsec processing required */ } #endif /* IPSEC */ /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct nhop_object *nh; nh = ro.ro_nh; if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) { struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa); u_long src = ntohl(ip->ip_src.s_addr); if (nh_ia != NULL && (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) { - if (nh->nh_flags & NHF_GATEWAY) - dest.s_addr = nh->gw4_sa.sin_addr.s_addr; - else - dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; + if (nh->nh_flags & NHF_GATEWAY) { + if (nh->gw_sa.sa_family == AF_INET) + dest.s_addr = nh->gw4_sa.sin_addr.s_addr; + else /* Do not redirect in case gw is AF_INET6 */ + type = 0; + } else + dest.s_addr = ip->ip_dst.s_addr; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_nh) mtu = ro.ro_nh->nh_mtu; RO_NHFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); return; } } if (mcopy == NULL) return; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest.s_addr, mtu); } #define CHECK_SO_CT(sp, ct) \ (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { bool stamped; stamped = false; if ((inp->inp_socket->so_options & SO_BINTIME) || CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { struct bintime boottimebin, bt; struct timespec ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt); getboottimebin(&boottimebin); bintime_add(&bt, &boottimebin); } else { bintime(&bt); } *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { struct bintime boottimebin, bt1; struct timespec ts1; struct timeval tv; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &tv); } else { microtime(&tv); } *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { struct bintime boottimebin; struct timespec ts, ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&ts, &ts1, &ts); } else { nanotime(&ts); } *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { struct timespec ts; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &ts); else nanouptime(&ts); *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ VNET_DEFINE_STATIC(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 3041232b7223..ad41c9df0b8c 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,1632 +1,1627 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SCTP) || defined(SCTP_SUPPORT) #include #include #endif #include #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; int pflags = PFIL_OUT; if (flags & IP_FORWARDING) pflags |= PFIL_FWD; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) { case PFIL_DROPPED: *error = EACCES; /* FALLTHROUGH */ case PFIL_CONSUMED: return 1; /* Finished */ case PFIL_PASS: *error = 0; } m = *mp; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } static int ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, - const struct sockaddr_in *gw, struct route *ro, bool stamp_tag) + const struct sockaddr *gw, struct route *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_epg_tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { error = EAGAIN; goto done; } /* * Always stamp tags that include NIC ktls. */ stamp_tag = true; } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } - error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); + error = (*ifp->if_output)(ifp, m, gw, ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* rte<>ro_flags translation */ static inline void rt_update_ro_flags(struct route *ro, const struct nhop_object *nh) { int nh_flags = nh->nh_flags; ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0; } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { MROUTER_RLOCK_TRACKER; struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu = 0; int error = 0; int vlan_pcp = -1; - struct sockaddr_in *dst, sin; - const struct sockaddr_in *gw; + struct sockaddr_in *dst; + const struct sockaddr *gw; struct in_ifaddr *ia = NULL; struct in_addr src; int isbroadcast; uint16_t ip_len, ip_off; + struct route iproute; uint32_t fibnum; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } if ((inp->inp_flags2 & INP_2PCP_SET) != 0) vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } if ((flags & IP_FORWARDING) == 0) IPSTAT_INC(ips_localout); /* * dst/gw handling: * * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); - if (ro != NULL) - dst = (struct sockaddr_in *)&ro->ro_dst; - else - dst = &sin; - if (ro == NULL || ro->ro_nh == NULL) { - bzero(dst, sizeof(*dst)); + if (ro == NULL) { + ro = &iproute; + bzero(ro, sizeof (*ro)); + } + dst = (struct sockaddr_in *)&ro->ro_dst; + if (ro->ro_nh == NULL) { dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } - gw = dst; + gw = (const struct sockaddr *)dst; again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ - if (inp != NULL && ro != NULL && ro->ro_nh != NULL) + if (inp != NULL && ro->ro_nh != NULL) NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ - if (ro != NULL && ro->ro_nh != NULL && + if (ro->ro_nh != NULL && ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); ia = NULL; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = 1; src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? in_ifaddr_broadcast(dst->sin_addr, ia) : 0; src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia, &in_ifa_tracker); isbroadcast = 0; /* fool gcc */ /* Interface may have no addresses. */ if (ia != NULL) src = IA_SIN(ia)->sin_addr; else src.s_addr = INADDR_ANY; - } else if (ro != NULL) { + } else if (ro != &iproute) { if (ro->ro_nh == NULL) { /* * We want to do any cloning requested by the link * layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ uint32_t flowid; flowid = m->m_pkthdr.flowid; ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_REF, flowid); if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } } struct nhop_object *nh = ro->ro_nh; ia = ifatoia(nh->nh_ifa); ifp = nh->nh_ifp; counter_u64_add(nh->nh_pksent, 1); rt_update_ro_flags(ro, nh); if (nh->nh_flags & NHF_GATEWAY) - gw = &nh->gw4_sa; + gw = &nh->gw_sa; if (nh->nh_flags & NHF_HOST) isbroadcast = (nh->nh_flags & NHF_BROADCAST); - else if (ifp->if_flags & IFF_BROADCAST) - isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); + else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET)) + isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia); else isbroadcast = 0; mtu = nh->nh_mtu; src = IA_SIN(ia)->sin_addr; } else { struct nhop_object *nh; nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, m->m_pkthdr.flowid); if (nh == NULL) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ifp = nh->nh_ifp; mtu = nh->nh_mtu; - /* - * We are rewriting here dst to be gw actually, contradicting - * comment at the beginning of the function. However, in this - * case we are always dealing with on stack dst. - * In case if pfil(9) sends us back to beginning of the - * function, the dst would be rewritten by ip_output_pfil(). - */ - MPASS(dst == &sin); + rt_update_ro_flags(ro, nh); if (nh->nh_flags & NHF_GATEWAY) - dst->sin_addr = nh->gw4_sa.sin_addr; + gw = &nh->gw_sa; ia = ifatoia(nh->nh_ifa); src = IA_SIN(ia)->sin_addr; isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && - in_ifaddr_broadcast(dst->sin_addr, ia))); + (gw->sa_family == AF_INET) && + in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia))); } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p", __func__, mtu, ro, (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ - gw = dst; + gw = (const struct sockaddr *)dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ MROUTER_RLOCK(); if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { MROUTER_RUNLOCK(); m_freem(m); goto done; } } MROUTER_RUNLOCK(); } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED_OUT(V_inet_pfil_head)) { switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ if (ro != NULL) { RO_NHFREE(ro); ro->ro_prepend = NULL; } - gw = dst; + gw = (const struct sockaddr *)dst; ip = mtod(m, struct ip *); goto again; } } if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); /* IN_LOOPBACK must not appear on the wire - RFC1122. */ if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } else if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. * Note that if_vxlan could have requested TSO even though the outer * frame is UDP. It is correct to not fragment such datagrams and * instead just pass them on to the driver. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & (CSUM_TSO | CSUM_INNER_TSO)) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO)) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = ip_output_send(inp, ifp, m, gw, ro, (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); error = ip_output_send(inp, ifp, m, gw, ro, true); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m0 = mb_unmapped_to_ext(m0); if (m0 == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { m0 = mb_unmapped_to_ext(m0); if (m0 == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; struct udphdr *uh; uint16_t cklen, csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if udp header is not in the first mbuf copy udplen */ if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); cklen = ntohs(cklen); } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } csum = in_cksum_skip(m, cklen + offset, offset); if (csum == 0) csum = 0xffff; } else { cklen = ntohs(ip->ip_len); csum = in_cksum_skip(m, cklen, offset); } offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif case IP_VLAN_PCP: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IP_VLAN_PCP: if ((optval >= -1) && (optval <= (INP_2PCP_MASK >> INP_2PCP_SHIFT))) { if (optval == -1) { INP_WLOCK(inp); inp->inp_flags2 &= ~(INP_2PCP_SET | INP_2PCP_MASK); INP_WUNLOCK(inp); } else { INP_WLOCK(inp); inp->inp_flags2 |= INP_2PCP_SET; inp->inp_flags2 &= ~INP_2PCP_MASK; inp->inp_flags2 |= optval << INP_2PCP_SHIFT; INP_WUNLOCK(inp); } } else error = EINVAL; break; } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: INP_RLOCK(inp); if (inp->inp_options) { struct mbuf *options; options = m_copym(inp->inp_options, 0, M_COPYALL, M_NOWAIT); INP_RUNLOCK(inp); if (options != NULL) { error = sooptcopyout(sopt, mtod(options, char *), options->m_len); m_freem(options); } else error = ENOMEM; } else { INP_RUNLOCK(inp); sopt->sopt_valsize = 0; } break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif case IP_VLAN_PCP: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; case IP_VLAN_PCP: if (OPTBIT2(INP_2PCP_SET)) { optval = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; } else { optval = -1; } break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 1c0be6011253..0bf55958c618 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -1,601 +1,602 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include static struct mtx toedev_lock; static TAILQ_HEAD(, toedev) toedev_list; static eventhandler_tag listen_start_eh; static eventhandler_tag listen_stop_eh; static eventhandler_tag lle_event_eh; static int toedev_connect(struct toedev *tod __unused, struct socket *so __unused, struct nhop_object *nh __unused, struct sockaddr *nam __unused) { return (ENOTSUP); } static int toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused) { return (ENOTSUP); } static int toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused) { return (ENOTSUP); } static void toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused, struct mbuf *m) { m_freem(m); return; } static void toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused) { return; } static int toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused) { return (ENOTSUP); } static void toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused) { return; } static void toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused, struct sockaddr *sa __unused, uint8_t *lladdr __unused, uint16_t vtag __unused) { return; } static void toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused, struct nhop_object *nh0 __unused, struct nhop_object *nh1 __unused) { return; } static void toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused) { return; } static void toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused) { return; } static int toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused, struct mbuf *m) { m_freem(m); return (0); } static void toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused, struct socket *so __unused) { return; } static void toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused, int sopt_dir __unused, int sopt_name __unused) { return; } static void toedev_tcp_info(struct toedev *tod __unused, struct tcpcb *tp __unused, struct tcp_info *ti __unused) { return; } static int toedev_alloc_tls_session(struct toedev *tod __unused, struct tcpcb *tp __unused, struct ktls_session *tls __unused, int direction __unused) { return (EINVAL); } static void toedev_pmtu_update(struct toedev *tod __unused, struct tcpcb *tp __unused, tcp_seq seq __unused, int mtu __unused) { return; } /* * Inform one or more TOE devices about a listening socket. */ static void toe_listen_start(struct inpcb *inp, void *arg) { struct toedev *t, *tod; struct tcpcb *tp; INP_WLOCK_ASSERT(inp); KASSERT(inp->inp_pcbinfo == &V_tcbinfo, ("%s: inp is not a TCP inp", __func__)); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) return; tp = intotcpcb(inp); if (tp->t_state != TCPS_LISTEN) return; t = arg; mtx_lock(&toedev_lock); TAILQ_FOREACH(tod, &toedev_list, link) { if (t == NULL || t == tod) tod->tod_listen_start(tod, tp); } mtx_unlock(&toedev_lock); } static void toe_listen_start_event(void *arg __unused, struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_LISTEN, ("%s: t_state %s", __func__, tcpstates[tp->t_state])); toe_listen_start(inp, NULL); } static void toe_listen_stop_event(void *arg __unused, struct tcpcb *tp) { struct toedev *tod; #ifdef INVARIANTS struct inpcb *inp = tp->t_inpcb; #endif INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_LISTEN, ("%s: t_state %s", __func__, tcpstates[tp->t_state])); mtx_lock(&toedev_lock); TAILQ_FOREACH(tod, &toedev_list, link) tod->tod_listen_stop(tod, tp); mtx_unlock(&toedev_lock); } /* * Fill up a freshly allocated toedev struct with reasonable defaults. */ void init_toedev(struct toedev *tod) { tod->tod_softc = NULL; /* * Provide no-op defaults so that the kernel can call any toedev * function without having to check whether the TOE driver supplied one * or not. */ tod->tod_connect = toedev_connect; tod->tod_listen_start = toedev_listen_start; tod->tod_listen_stop = toedev_listen_stop; tod->tod_input = toedev_input; tod->tod_rcvd = toedev_rcvd; tod->tod_output = toedev_output; tod->tod_send_rst = toedev_output; tod->tod_send_fin = toedev_output; tod->tod_pcb_detach = toedev_pcb_detach; tod->tod_l2_update = toedev_l2_update; tod->tod_route_redirect = toedev_route_redirect; tod->tod_syncache_added = toedev_syncache_added; tod->tod_syncache_removed = toedev_syncache_removed; tod->tod_syncache_respond = toedev_syncache_respond; tod->tod_offload_socket = toedev_offload_socket; tod->tod_ctloutput = toedev_ctloutput; tod->tod_tcp_info = toedev_tcp_info; tod->tod_alloc_tls_session = toedev_alloc_tls_session; tod->tod_pmtu_update = toedev_pmtu_update; } /* * Register an active TOE device with the system. This allows it to receive * notifications from the kernel. */ int register_toedev(struct toedev *tod) { struct toedev *t; mtx_lock(&toedev_lock); TAILQ_FOREACH(t, &toedev_list, link) { if (t == tod) { mtx_unlock(&toedev_lock); return (EEXIST); } } TAILQ_INSERT_TAIL(&toedev_list, tod, link); registered_toedevs++; mtx_unlock(&toedev_lock); inp_apply_all(toe_listen_start, tod); return (0); } /* * Remove the TOE device from the global list of active TOE devices. It is the * caller's responsibility to ensure that the TOE device is quiesced prior to * this call. */ int unregister_toedev(struct toedev *tod) { struct toedev *t, *t2; int rc = ENODEV; mtx_lock(&toedev_lock); TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) { if (t == tod) { TAILQ_REMOVE(&toedev_list, tod, link); registered_toedevs--; rc = 0; break; } } KASSERT(registered_toedevs >= 0, ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs)); mtx_unlock(&toedev_lock); return (rc); } void toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, void *tod, void *todctx, uint8_t iptos) { INP_RLOCK_ASSERT(inp); (void )syncache_add(inc, to, th, inp, inp->inp_socket, NULL, tod, todctx, iptos, htons(0)); } int toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop) { NET_EPOCH_ASSERT(); return (syncache_expand(inc, to, th, lsop, NULL, htons(0))); } /* * General purpose check to see if a 4-tuple is in use by the kernel. If a TCP * header (presumably for an incoming SYN) is also provided, an existing 4-tuple * in TIME_WAIT may be assassinated freeing it up for re-use. * * Note that the TCP header must have been run through tcp_fields_to_host() or * equivalent. */ int toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp) { struct inpcb *inp; if (inc->inc_flags & INC_ISIPV6) { inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr, inc->inc_fport, &inc->inc6_laddr, inc->inc_lport, INPLOOKUP_RLOCKPCB, ifp); } else { inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport, inc->inc_laddr, inc->inc_lport, INPLOOKUP_RLOCKPCB, ifp); } if (inp != NULL) { INP_RLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { if (!tcp_twcheck(inp, NULL, th, NULL, 0)) return (EADDRINUSE); } else { INP_RUNLOCK(inp); return (EADDRINUSE); } } return (0); } static void toe_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct toedev *tod; struct ifnet *ifp; struct sockaddr *sa; uint8_t *lladdr; uint16_t vid, pcp; int family; struct sockaddr_in6 sin6; LLE_WLOCK_ASSERT(lle); ifp = lltable_get_ifp(lle->lle_tbl); family = lltable_get_af(lle->lle_tbl); if (family != AF_INET && family != AF_INET6) return; /* * Not interested if the interface's TOE capability is not enabled. */ if ((family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) || (family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6))) return; tod = TOEDEV(ifp); if (tod == NULL) return; sa = (struct sockaddr *)&sin6; lltable_fill_sa_entry(lle, sa); vid = 0xfff; pcp = 0; if (evt != LLENTRY_RESOLVED) { /* * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean * this entry is going to be deleted. */ lladdr = NULL; } else { KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); lladdr = (uint8_t *)lle->ll_addr; VLAN_TAG(ifp, &vid); VLAN_PCP(ifp, &pcp); } tod->tod_l2_update(tod, ifp, sa, lladdr, EVL_MAKETAG(vid, pcp, 0)); } /* * Returns 0 or EWOULDBLOCK on success (any other value is an error). 0 means * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's * tod_l2_update will be called later, when the entry is resolved or times out. */ int toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, uint8_t *lladdr, uint16_t *vtag) { int rc; uint16_t vid, pcp; switch (sa->sa_family) { #ifdef INET case AF_INET: rc = arpresolve(ifp, 0, NULL, sa, lladdr, NULL, NULL); break; #endif #ifdef INET6 case AF_INET6: - rc = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), NULL, sa, lladdr, NULL, NULL); + rc = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), NULL, sa, lladdr, + NULL, NULL); break; #endif default: return (EPROTONOSUPPORT); } if (rc == 0) { vid = 0xfff; pcp = 0; if (ifp->if_type == IFT_L2VLAN) { VLAN_TAG(ifp, &vid); VLAN_PCP(ifp, &pcp); } else if (ifp->if_pcp != IFNET_PCP_NONE) { vid = 0; pcp = ifp->if_pcp; } *vtag = EVL_MAKETAG(vid, pcp, 0); } return (rc); } void toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err) { NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_DROPPED)) { struct tcpcb *tp = intotcpcb(inp); KASSERT(tp->t_flags & TF_TOE, ("%s: tp %p not offloaded.", __func__, tp)); if (err == EAGAIN) { /* * Temporary failure during offload, take this PCB back. * Detach from the TOE driver and do the rest of what * TCP's pru_connect would have done if the connection * wasn't offloaded. */ tod->tod_pcb_detach(tod, tp); KASSERT(!(tp->t_flags & TF_TOE), ("%s: tp %p still offloaded.", __func__, tp)); tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); (void) tp->t_fb->tfb_tcp_output(tp); } else { tp = tcp_drop(tp, err); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ } } INP_WLOCK_ASSERT(inp); } static int toecore_load(void) { mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF); TAILQ_INIT(&toedev_list); listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start, toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY); listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY); lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL, EVENTHANDLER_PRI_ANY); return (0); } static int toecore_unload(void) { mtx_lock(&toedev_lock); if (!TAILQ_EMPTY(&toedev_list)) { mtx_unlock(&toedev_lock); return (EBUSY); } EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh); EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh); EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); mtx_unlock(&toedev_lock); mtx_destroy(&toedev_lock); return (0); } static int toecore_mod_handler(module_t mod, int cmd, void *arg) { if (cmd == MOD_LOAD) return (toecore_load()); if (cmd == MOD_UNLOAD) return (toecore_unload()); return (EOPNOTSUPP); } static moduledata_t mod_data= { "toecore", toecore_mod_handler, 0 }; MODULE_VERSION(toecore, 1); DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/ofed/drivers/infiniband/core/ib_addr.c b/sys/ofed/drivers/infiniband/core/ib_addr.c index 297469bd4d87..2ac79ca64664 100644 --- a/sys/ofed/drivers/infiniband/core/ib_addr.c +++ b/sys/ofed/drivers/infiniband/core/ib_addr.c @@ -1,908 +1,916 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include "core_priv.h" struct addr_req { struct list_head list; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); int timeout; int status; }; static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; int rdma_addr_size(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); case AF_IB: return sizeof(struct sockaddr_ib); default: return 0; } } EXPORT_SYMBOL(rdma_addr_size); int rdma_addr_size_in6(struct sockaddr_in6 *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_in6); int rdma_addr_size_kss(struct sockaddr_storage *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_kss); static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); init_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_register_client); static inline void put_client(struct rdma_addr_client *client) { if (atomic_dec_and_test(&client->refcount)) complete(&client->comp); } void rdma_addr_unregister_client(struct rdma_addr_client *client) { put_client(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_unregister_client); static inline void rdma_copy_addr_sub(u8 *dst, const u8 *src, unsigned min, unsigned max) { if (min > max) min = max; memcpy(dst, src, min); memset(dst + min, 0, max - min); } int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { /* check for loopback device */ if (dev->if_flags & IFF_LOOPBACK) { dev_addr->dev_type = ARPHRD_ETHER; memset(dev_addr->src_dev_addr, 0, MAX_ADDR_LEN); memset(dev_addr->broadcast, 0, MAX_ADDR_LEN); memset(dev_addr->dst_dev_addr, 0, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->if_index; return (0); } else if (dev->if_type == IFT_INFINIBAND) dev_addr->dev_type = ARPHRD_INFINIBAND; else if (dev->if_type == IFT_ETHER) dev_addr->dev_type = ARPHRD_ETHER; else dev_addr->dev_type = 0; rdma_copy_addr_sub(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen, MAX_ADDR_LEN); rdma_copy_addr_sub(dev_addr->broadcast, dev->if_broadcastaddr, dev->if_addrlen, MAX_ADDR_LEN); if (dst_dev_addr != NULL) { rdma_copy_addr_sub(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen, MAX_ADDR_LEN); } dev_addr->bound_dev_if = dev->if_index; return 0; } EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { struct ifnet *dev; int ret; if (dev_addr->bound_dev_if) { dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); } else switch (addr->sa_family) { #ifdef INET case AF_INET: dev = ip_ifp_find(dev_addr->net, ((const struct sockaddr_in *)addr)->sin_addr.s_addr); break; #endif #ifdef INET6 case AF_INET6: dev = ip6_ifp_find(dev_addr->net, ((const struct sockaddr_in6 *)addr)->sin6_addr, 0); break; #endif default: dev = NULL; break; } if (dev != NULL) { /* disallow connections through 127.0.0.1 itself */ if (dev->if_flags & IFF_LOOPBACK) ret = -EINVAL; else ret = rdma_copy_addr(dev_addr, dev, NULL); dev_put(dev); } else { ret = -ENODEV; } return ret; } EXPORT_SYMBOL(rdma_translate_ip); static void set_timeout(int time) { int delay; /* under FreeBSD ticks are 32-bit */ delay = time - jiffies; if (delay <= 0) delay = 1; else if (delay > hz) delay = hz; mod_delayed_work(addr_wq, &work, delay); } static void queue_req(struct addr_req *req) { struct addr_req *temp_req; mutex_lock(&lock); list_for_each_entry_reverse(temp_req, &req_list, list) { if (time_after_eq(req->timeout, temp_req->timeout)) break; } list_add(&req->list, &temp_req->list); if (req_list.next == &req->list) set_timeout(req->timeout); mutex_unlock(&lock); } #if defined(INET) || defined(INET6) static int addr_resolve_multi(u8 *edst, struct ifnet *ifp, struct sockaddr *dst_in) { struct sockaddr *llsa; struct sockaddr_dl sdl; int error; sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; if (ifp->if_resolvemulti == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_resolvemulti(ifp, &llsa, dst_in); if (error == 0) { rdma_copy_addr_sub(edst, LLADDR((struct sockaddr_dl *)llsa), ifp->if_addrlen, MAX_ADDR_LEN); } } return (error); } #endif #ifdef INET static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { enum { ADDR_VALID = 0, ADDR_SRC_ANY = 1, ADDR_DST_ANY = 2, }; struct sockaddr_in dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; struct nhop_object *nh; struct ifnet *ifp; int error; int type; NET_EPOCH_ASSERT(); /* set VNET, if any */ CURVNET_SET(addr->net); /* set default TTL limit */ addr->hoplimit = V_ip_defttl; type = ADDR_VALID; if (src_in->sin_addr.s_addr == INADDR_ANY) type |= ADDR_SRC_ANY; if (dst_tmp.sin_addr.s_addr == INADDR_ANY) type |= ADDR_DST_ANY; /* * Make sure the socket address length field is set. */ dst_tmp.sin_len = sizeof(dst_tmp); /* Step 1 - lookup destination route if any */ switch (type) { case ADDR_VALID: case ADDR_SRC_ANY: /* regular destination route lookup */ nh = fib4_lookup(RT_DEFAULT_FIB, dst_tmp.sin_addr,0,NHR_NONE,0); if (nh == NULL) { error = EHOSTUNREACH; goto done; } break; default: error = ENETUNREACH; goto done; } /* Step 2 - find outgoing network interface */ switch (type) { case ADDR_VALID: /* get source interface */ if (addr->bound_dev_if != 0) { ifp = dev_get_by_index(addr->net, addr->bound_dev_if); } else { ifp = ip_ifp_find(addr->net, src_in->sin_addr.s_addr); } /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. */ error = EHOSTUNREACH; goto error_put_ifp; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin_addr, &dst_in->sin_addr, sizeof(src_in->sin_addr))) { /* * Destination is loopback, but source * and destination address is not the * same. */ error = EHOSTUNREACH; goto error_put_ifp; } /* get destination network interface from route */ dev_put(ifp); ifp = nh->nh_ifp; dev_hold(ifp); } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. */ error = ENETUNREACH; goto error_put_ifp; } break; case ADDR_SRC_ANY: /* check for loopback device */ if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ ifp = nh->nh_ifp; dev_hold(ifp); break; default: break; } /* * Step 3 - resolve destination MAC address */ if (dst_tmp.sin_addr.s_addr == INADDR_BROADCAST) { rdma_copy_addr_sub(edst, ifp->if_broadcastaddr, ifp->if_addrlen, MAX_ADDR_LEN); error = 0; } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV4; } else if (ifp->if_flags & IFF_LOOPBACK) { memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); - error = arpresolve(ifp, is_gw, NULL, is_gw ? - &nh->gw_sa : (const struct sockaddr *)&dst_tmp, - edst, NULL, NULL); +#ifdef INET6 + if (is_gw && nh->gw_sa.sa_family == AF_INET6) + error = nd6_resolve(ifp, LLE_SF(AF_INET, is_gw), NULL, + &nh->gw_sa, edst, NULL, NULL); + else +#endif + error = arpresolve(ifp, is_gw, NULL, is_gw ? + &nh->gw_sa : (const struct sockaddr *)&dst_tmp, + edst, NULL, NULL); + if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV4; } /* * Step 4 - update source address, if any */ if (saddr != NULL) { src_port = src_in->sin_port; memcpy(src_in, saddr, rdma_addr_size(saddr)); src_in->sin_port = src_port; /* preserve port number */ } *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); done: CURVNET_RESTORE(); if (error == EWOULDBLOCK || error == EAGAIN) error = ENODATA; return (-error); } #else static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { return -EADDRNOTAVAIL; } #endif #ifdef INET6 static int addr6_resolve(struct sockaddr_in6 *src_in, const struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { enum { ADDR_VALID = 0, ADDR_SRC_ANY = 1, ADDR_DST_ANY = 2, }; struct sockaddr_in6 dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; struct nhop_object *nh; struct ifnet *ifp; int error; int type; NET_EPOCH_ASSERT(); /* set VNET, if any */ CURVNET_SET(addr->net); /* set default TTL limit */ addr->hoplimit = V_ip_defttl; type = ADDR_VALID; if (ipv6_addr_any(&src_in->sin6_addr)) type |= ADDR_SRC_ANY; if (ipv6_addr_any(&dst_tmp.sin6_addr)) type |= ADDR_DST_ANY; /* * Make sure the socket address length field is set. */ dst_tmp.sin6_len = sizeof(dst_tmp); /* * Make sure the scope ID gets embedded, else nd6_resolve() will * not find the record. */ dst_tmp.sin6_scope_id = addr->bound_dev_if; sa6_embedscope(&dst_tmp, 0); /* Step 1 - lookup destination route if any */ switch (type) { case ADDR_VALID: /* sanity check for IPv4 addresses */ if (ipv6_addr_v4mapped(&src_in->sin6_addr) != ipv6_addr_v4mapped(&dst_tmp.sin6_addr)) { error = EAFNOSUPPORT; goto done; } /* FALLTHROUGH */ case ADDR_SRC_ANY: /* regular destination route lookup */ nh = fib6_lookup(RT_DEFAULT_FIB, &dst_in->sin6_addr, addr->bound_dev_if, NHR_NONE, 0); if (nh == NULL) { error = EHOSTUNREACH; goto done; } break; default: error = ENETUNREACH; goto done; } /* Step 2 - find outgoing network interface */ switch (type) { case ADDR_VALID: /* get source interface */ if (addr->bound_dev_if != 0) { ifp = dev_get_by_index(addr->net, addr->bound_dev_if); } else { ifp = ip6_ifp_find(addr->net, src_in->sin6_addr, 0); } /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. */ error = EHOSTUNREACH; goto error_put_ifp; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin6_addr, &dst_in->sin6_addr, sizeof(src_in->sin6_addr))) { /* * Destination is loopback, but source * and destination address is not the * same. */ error = EHOSTUNREACH; goto error_put_ifp; } /* get destination network interface from route */ dev_put(ifp); ifp = nh->nh_ifp; dev_hold(ifp); } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. */ error = ENETUNREACH; goto error_put_ifp; } break; case ADDR_SRC_ANY: /* check for loopback device */ if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ ifp = nh->nh_ifp; dev_hold(ifp); break; default: break; } /* * Step 3 - resolve destination MAC address */ if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV6; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); error = nd6_resolve(ifp, LLE_SF(AF_INET6, is_gw), NULL, is_gw ? &nh->gw_sa : (const struct sockaddr *)&dst_tmp, edst, NULL, NULL); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV6; } /* * Step 4 - update source address, if any */ if (saddr != NULL) { src_port = src_in->sin6_port; memcpy(src_in, saddr, rdma_addr_size(saddr)); src_in->sin6_port = src_port; /* preserve port number */ } *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); done: CURVNET_RESTORE(); if (error == EWOULDBLOCK || error == EAGAIN) error = ENODATA; return (-error); } #else static int addr6_resolve(struct sockaddr_in6 *src_in, const struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { return -EADDRNOTAVAIL; } #endif static int addr_resolve_neigh(struct ifnet *dev, const struct sockaddr *dst_in, u8 *edst, struct rdma_dev_addr *addr) { if (dev->if_flags & IFF_LOOPBACK) { int ret; /* * Binding to a loopback device is not allowed. Make * sure the destination device address is global by * clearing the bound device interface: */ if (addr->bound_dev_if == dev->if_index) addr->bound_dev_if = 0; ret = rdma_translate_ip(dst_in, addr); if (ret == 0) { memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); } return ret; } /* If the device doesn't do ARP internally */ if (!(dev->if_flags & IFF_NOARP)) return rdma_copy_addr(addr, dev, edst); return rdma_copy_addr(addr, dev, NULL); } static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr) { struct epoch_tracker et; struct ifnet *ndev = NULL; u8 edst[MAX_ADDR_LEN]; int ret; if (dst_in->sa_family != src_in->sa_family) return -EINVAL; NET_EPOCH_ENTER(et); switch (src_in->sa_family) { case AF_INET: ret = addr4_resolve((struct sockaddr_in *)src_in, (const struct sockaddr_in *)dst_in, addr, edst, &ndev); break; case AF_INET6: ret = addr6_resolve((struct sockaddr_in6 *)src_in, (const struct sockaddr_in6 *)dst_in, addr, edst, &ndev); break; default: ret = -EADDRNOTAVAIL; break; } NET_EPOCH_EXIT(et); /* check for error */ if (ret != 0) return ret; /* store MAC addresses and check for loopback */ ret = addr_resolve_neigh(ndev, dst_in, edst, addr); /* set belonging VNET, if any */ addr->net = dev_net(ndev); dev_put(ndev); return ret; } static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; struct sockaddr *src_in, *dst_in; struct list_head done_list; INIT_LIST_HEAD(&done_list); mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) continue; } list_move_tail(&req->list, &done_list); } if (!list_empty(&req_list)) { req = list_entry(req_list.next, struct addr_req, list); set_timeout(req->timeout); } mutex_unlock(&lock); list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); req->callback(req->status, (struct sockaddr *) &req->src_addr, req->addr, req->context); put_client(req->client); kfree(req); } } int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; int ret = 0; req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; if (src_addr) { if (src_addr->sa_family != dst_addr->sa_family) { ret = -EINVAL; goto err; } memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; queue_req(req); break; case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); break; default: ret = req->status; atomic_dec(&client->refcount); goto err; } return ret; err: kfree(req); return ret; } EXPORT_SYMBOL(rdma_resolve_ip); int rdma_resolve_ip_route(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr) { struct sockaddr_storage ssrc_addr = {}; struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; if (src_addr) { if (src_addr->sa_family != dst_addr->sa_family) return -EINVAL; memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } return addr_resolve(src_in, dst_addr, addr); } EXPORT_SYMBOL(rdma_resolve_ip_route); void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); set_timeout(req->timeout); break; } } mutex_unlock(&lock); } EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; int status; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { if (!status) memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct rdma_dev_addr)); ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, struct ifnet *dev, int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; union rdma_sockaddr sgid_addr, dgid_addr; rdma_gid2ip(&sgid_addr._sockaddr, sgid); rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); dev_addr.bound_dev_if = dev->if_index; dev_addr.net = dev_net(dev); ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; wait_for_completion(&ctx.comp); ret = ctx.status; if (ret) return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); if (hoplimit) *hoplimit = dev_addr.hoplimit; return ret; } EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int addr_init(void) { addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); if (!addr_wq) return -ENOMEM; rdma_addr_register_client(&self); return 0; } void addr_cleanup(void) { rdma_addr_unregister_client(&self); destroy_workqueue(addr_wq); }