diff --git a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c index ef03cf678ed7..5f16d8b84c93 100644 --- a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c @@ -1,1452 +1,1455 @@ /* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. */ #if !defined(lint) static const char sccsid[] = "@(#)ip_fil.c 2.41 6/5/96 (C) 1993-2000 Darren Reed"; static const char rcsid[] = "@(#)$Id$"; #endif #if defined(KERNEL) || defined(_KERNEL) # undef KERNEL # undef _KERNEL # define KERNEL 1 # define _KERNEL 1 #endif #if defined(__FreeBSD_version) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_inet6.h" #endif #if defined(__FreeBSD_version) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_random_ip_id.h" #endif #include #include #include #include #include #include #include #include #include #include # include #if defined(__FreeBSD_version) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netinet/ip_compat.h" #ifdef USE_INET6 # include #endif #include "netinet/ip_fil.h" #include "netinet/ip_nat.h" #include "netinet/ip_frag.h" #include "netinet/ip_state.h" #include "netinet/ip_proxy.h" #include "netinet/ip_auth.h" #include "netinet/ip_sync.h" #include "netinet/ip_lookup.h" #include "netinet/ip_dstlist.h" #ifdef IPFILTER_SCAN #include "netinet/ip_scan.h" #endif #include "netinet/ip_pool.h" #include #include #ifdef CSUM_DATA_VALID #include #endif extern int ip_optcopy __P((struct ip *, struct ip *)); # ifdef IPFILTER_M_IPFILTER MALLOC_DEFINE(M_IPFILTER, "ipfilter", "IP Filter packet filter data structures"); # endif static int ipf_send_ip __P((fr_info_t *, mb_t *)); static void ipf_timer_func __P((void *arg)); VNET_DEFINE(ipf_main_softc_t, ipfmain) = { .ipf_running = -2, }; #define V_ipfmain VNET(ipfmain) # include # include VNET_DEFINE_STATIC(eventhandler_tag, ipf_arrivetag); VNET_DEFINE_STATIC(eventhandler_tag, ipf_departtag); #define V_ipf_arrivetag VNET(ipf_arrivetag) #define V_ipf_departtag VNET(ipf_departtag) #if 0 /* * Disable the "cloner" event handler; we are getting interface * events before the firewall is fully initiallized and also no vnet * information thus leading to uninitialised memory accesses. * In addition it is unclear why we need it in first place. * If it turns out to be needed, well need a dedicated event handler * for it to deal with the ifc and the correct vnet. */ VNET_DEFINE_STATIC(eventhandler_tag, ipf_clonetag); #define V_ipf_clonetag VNET(ipf_clonetag) #endif static void ipf_ifevent(void *arg, struct ifnet *ifp); static void ipf_ifevent(arg, ifp) void *arg; struct ifnet *ifp; { CURVNET_SET(ifp->if_vnet); if (V_ipfmain.ipf_running > 0) ipf_sync(&V_ipfmain, NULL); CURVNET_RESTORE(); } static pfil_return_t ipf_check_wrapper(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct ip *ip = mtod(*mp, struct ip *); pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, ip, ip->ip_hl << 2, ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? PFIL_PASS : PFIL_DROPPED); } #ifdef USE_INET6 static pfil_return_t ipf_check_wrapper6(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, mtod(*mp, struct ip *), sizeof(struct ip6_hdr), ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? PFIL_PASS : PFIL_DROPPED); } # endif #if defined(IPFILTER_LKM) int ipf_identify(s) char *s; { if (strcmp(s, "ipl") == 0) return 1; return 0; } #endif /* IPFILTER_LKM */ static void ipf_timer_func(arg) void *arg; { ipf_main_softc_t *softc = arg; SPL_INT(s); SPL_NET(s); READ_ENTER(&softc->ipf_global); if (softc->ipf_running > 0) ipf_slowtimer(softc); if (softc->ipf_running == -1 || softc->ipf_running == 1) { #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, hz/2); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); } RWLOCK_EXIT(&softc->ipf_global); SPL_X(s); } int ipfattach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif SPL_NET(s); if (softc->ipf_running > 0) { SPL_X(s); return EBUSY; } if (ipf_init_all(softc) < 0) { SPL_X(s); return EIO; } bzero((char *)V_ipfmain.ipf_selwait, sizeof(V_ipfmain.ipf_selwait)); softc->ipf_running = 1; if (softc->ipf_control_forwarding & 1) V_ipforwarding = 1; SPL_X(s); #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); return 0; } /* * Disable the filter by removing the hooks from the IP input/output * stream. */ int ipfdetach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif if (softc->ipf_control_forwarding & 2) V_ipforwarding = 0; SPL_NET(s); #if 0 if (softc->ipf_slow_ch.callout != NULL) untimeout(ipf_timer_func, softc, softc->ipf_slow_ch); bzero(&softc->ipf_slow, sizeof(softc->ipf_slow)); #endif callout_drain(&softc->ipf_slow_ch); ipf_fini_all(softc); softc->ipf_running = -2; SPL_X(s); return 0; } /* * Filter ioctl interface. */ int ipfioctl(dev, cmd, data, mode, p) struct thread *p; # define p_cred td_ucred # define p_uid td_ucred->cr_ruid struct cdev *dev; ioctlcmd_t cmd; caddr_t data; int mode; { int error = 0, unit = 0; SPL_INT(s); CURVNET_SET(TD_TO_VNET(p)); #if (BSD >= 199306) if (securelevel_ge(p->p_cred, 3) && (mode & FWRITE)) { V_ipfmain.ipf_interror = 130001; CURVNET_RESTORE(); return EPERM; } #endif unit = GET_MINOR(dev); if ((IPL_LOGMAX < unit) || (unit < 0)) { V_ipfmain.ipf_interror = 130002; CURVNET_RESTORE(); return ENXIO; } if (V_ipfmain.ipf_running <= 0) { if (unit != IPL_LOGIPF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130003; CURVNET_RESTORE(); return EIO; } if (cmd != SIOCIPFGETNEXT && cmd != SIOCIPFGET && cmd != SIOCIPFSET && cmd != SIOCFRENB && cmd != SIOCGETFS && cmd != SIOCGETFF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130004; CURVNET_RESTORE(); return EIO; } } SPL_NET(s); error = ipf_ioctlswitch(&V_ipfmain, unit, data, cmd, mode, p->p_uid, p); CURVNET_RESTORE(); if (error != -1) { SPL_X(s); return error; } SPL_X(s); return error; } /* * ipf_send_reset - this could conceivably be a call to tcp_respond(), but that * requires a large amount of setting up and isn't any more efficient. */ int ipf_send_reset(fin) fr_info_t *fin; { struct tcphdr *tcp, *tcp2; int tlen = 0, hlen; struct mbuf *m; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip; tcp = fin->fin_dp; if (tcp->th_flags & TH_RST) return -1; /* feedback loop */ if (ipf_checkl4sum(fin) == -1) return -1; tlen = fin->fin_dlen - (TCP_OFF(tcp) << 2) + ((tcp->th_flags & TH_SYN) ? 1 : 0) + ((tcp->th_flags & TH_FIN) ? 1 : 0); #ifdef USE_INET6 hlen = (fin->fin_v == 6) ? sizeof(ip6_t) : sizeof(ip_t); #else hlen = sizeof(ip_t); #endif #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; if (sizeof(*tcp2) + hlen > MLEN) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } } m->m_len = sizeof(*tcp2) + hlen; #if (BSD >= 199103) m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = (struct ifnet *)0; #endif ip = mtod(m, struct ip *); bzero((char *)ip, hlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; #endif tcp2 = (struct tcphdr *)((char *)ip + hlen); tcp2->th_sport = tcp->th_dport; tcp2->th_dport = tcp->th_sport; if (tcp->th_flags & TH_ACK) { tcp2->th_seq = tcp->th_ack; tcp2->th_flags = TH_RST; tcp2->th_ack = 0; } else { tcp2->th_seq = 0; tcp2->th_ack = ntohl(tcp->th_seq); tcp2->th_ack += tlen; tcp2->th_ack = htonl(tcp2->th_ack); tcp2->th_flags = TH_RST|TH_ACK; } TCP_X2_A(tcp2, 0); TCP_OFF_A(tcp2, sizeof(*tcp2) >> 2); tcp2->th_win = tcp->th_win; tcp2->th_sum = 0; tcp2->th_urp = 0; #ifdef USE_INET6 if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = 0; ip6->ip6_src = fin->fin_dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; tcp2->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*ip6), sizeof(*tcp2)); return ipf_send_ip(fin, m); } #endif ip->ip_p = IPPROTO_TCP; ip->ip_len = htons(sizeof(struct tcphdr)); ip->ip_src.s_addr = fin->fin_daddr; ip->ip_dst.s_addr = fin->fin_saddr; tcp2->th_sum = in_cksum(m, hlen + sizeof(*tcp2)); ip->ip_len = htons(hlen + sizeof(*tcp2)); return ipf_send_ip(fin, m); } /* * ip_len must be in network byte order when called. */ static int ipf_send_ip(fin, m) fr_info_t *fin; mb_t *m; { fr_info_t fnew; ip_t *ip, *oip; int hlen; ip = mtod(m, ip_t *); bzero((char *)&fnew, sizeof(fnew)); fnew.fin_main_soft = fin->fin_main_soft; IP_V_A(ip, fin->fin_v); switch (fin->fin_v) { case 4 : oip = fin->fin_ip; hlen = sizeof(*oip); fnew.fin_v = 4; fnew.fin_p = ip->ip_p; fnew.fin_plen = ntohs(ip->ip_len); IP_HL_A(ip, sizeof(*oip) >> 2); ip->ip_tos = oip->ip_tos; ip->ip_id = fin->fin_ip->ip_id; ip->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); ip->ip_ttl = V_ip_defttl; ip->ip_sum = 0; break; #ifdef USE_INET6 case 6 : { ip6_t *ip6 = (ip6_t *)ip; ip6->ip6_vfc = 0x60; ip6->ip6_hlim = IPDEFTTL; hlen = sizeof(*ip6); fnew.fin_p = ip6->ip6_nxt; fnew.fin_v = 6; fnew.fin_plen = ntohs(ip6->ip6_plen) + hlen; break; } #endif default : return EINVAL; } #ifdef IPSEC_SUPPORT m->m_pkthdr.rcvif = NULL; #endif fnew.fin_ifp = fin->fin_ifp; fnew.fin_flx = FI_NOCKSUM; fnew.fin_m = m; fnew.fin_ip = ip; fnew.fin_mp = &m; fnew.fin_hlen = hlen; fnew.fin_dp = (char *)ip + hlen; (void) ipf_makefrip(hlen, ip, &fnew); return ipf_fastroute(m, &m, &fnew, NULL); } int ipf_send_icmp_err(type, fin, dst) int type; fr_info_t *fin; int dst; { int err, hlen, xtra, iclen, ohlen, avail, code; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip, *ip2; if ((type < 0) || (type >= ICMP_MAXTYPE)) return -1; code = fin->fin_icode; #ifdef USE_INET6 /* See NetBSD ip_fil_netbsd.c r1.4: */ if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return -1; #endif if (ipf_checkl4sum(fin) == -1) return -1; #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; avail = MHLEN; xtra = 0; hlen = 0; ohlen = 0; dst4.s_addr = 0; ifp = fin->fin_ifp; if (fin->fin_v == 4) { if ((fin->fin_p == IPPROTO_ICMP) && !(fin->fin_flx & FI_SHORT)) switch (ntohs(fin->fin_data[0]) >> 8) { case ICMP_ECHO : case ICMP_TSTAMP : case ICMP_IREQ : case ICMP_MASKREQ : break; default : FREE_MB_T(m); return 0; } if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 4, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } dst4 = dst6.in4; } else dst4.s_addr = fin->fin_daddr; hlen = sizeof(ip_t); ohlen = fin->fin_hlen; iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; if (fin->fin_hlen < fin->fin_plen) xtra = MIN(fin->fin_dlen, 8); else xtra = 0; } #ifdef USE_INET6 else if (fin->fin_v == 6) { hlen = sizeof(ip6_t); ohlen = sizeof(ip6_t); iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; type = icmptoicmp6types[type]; if (type == ICMP6_DST_UNREACH) code = icmptoicmp6unreach[code]; if (iclen + max_linkhdr + fin->fin_plen > avail) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } avail = MCLBYTES; } xtra = MIN(fin->fin_plen, avail - iclen - max_linkhdr); xtra = MIN(xtra, IPV6_MMTU - iclen); if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 6, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } } else dst6 = fin->fin_dst6; } #endif else { FREE_MB_T(m); return -1; } avail -= (max_linkhdr + iclen); if (avail < 0) { FREE_MB_T(m); return -1; } if (xtra > avail) xtra = avail; iclen += xtra; m->m_data += max_linkhdr; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = iclen; m->m_len = iclen; ip = mtod(m, ip_t *); icmp = (struct icmp *)((char *)ip + hlen); ip2 = (ip_t *)&icmp->icmp_ip; icmp->icmp_type = type; icmp->icmp_code = fin->fin_icode; icmp->icmp_cksum = 0; #ifdef icmp_nextmtu if (type == ICMP_UNREACH && fin->fin_icode == ICMP_UNREACH_NEEDFRAG) { if (fin->fin_mtu != 0) { icmp->icmp_nextmtu = htons(fin->fin_mtu); } else if (ifp != NULL) { icmp->icmp_nextmtu = htons(GETIFMTU_4(ifp)); } else { /* make up a number... */ icmp->icmp_nextmtu = htons(fin->fin_plen - 20); } } #endif bcopy((char *)fin->fin_ip, (char *)ip2, ohlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(iclen - hlen); ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 0; ip6->ip6_src = dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), iclen - hlen); } else #endif { ip->ip_p = IPPROTO_ICMP; ip->ip_src.s_addr = dst4.s_addr; ip->ip_dst.s_addr = fin->fin_saddr; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = ipf_cksum((u_short *)icmp, sizeof(*icmp) + 8); ip->ip_len = htons(iclen); ip->ip_p = IPPROTO_ICMP; } err = ipf_send_ip(fin, m); return err; } /* * m0 - pointer to mbuf where the IP packet starts * mpp - pointer to the mbuf pointer that is the start of the mbuf chain */ int ipf_fastroute(m0, mpp, fin, fdp) mb_t *m0, **mpp; fr_info_t *fin; frdest_t *fdp; { register struct ip *ip, *mhip; register struct mbuf *m = *mpp; int len, off, error = 0, hlen, code; struct ifnet *ifp, *sifp; struct sockaddr_in dst; struct nhop4_extended nh4; int has_nhop = 0; u_long fibnum = 0; u_short ip_off; frdest_t node; frentry_t *fr; #ifdef M_WRITABLE /* * HOT FIX/KLUDGE: * * If the mbuf we're about to send is not writable (because of * a cluster reference, for example) we'll need to make a copy * of it since this routine modifies the contents. * * If you have non-crappy network hardware that can transmit data * from the mbuf, rather than making a copy, this is gonna be a * problem. */ if (M_WRITABLE(m) == 0) { m0 = m_dup(m, M_NOWAIT); if (m0 != NULL) { FREE_MB_T(m); m = m0; *mpp = m; } else { error = ENOBUFS; FREE_MB_T(m); goto done; } } #endif #ifdef USE_INET6 if (fin->fin_v == 6) { /* * currently "to " and "to :ip#" are not supported * for IPv6 */ return ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif hlen = fin->fin_hlen; ip = mtod(m0, struct ip *); ifp = NULL; /* * Route packet. */ bzero(&dst, sizeof (dst)); dst.sin_family = AF_INET; dst.sin_addr = ip->ip_dst; dst.sin_len = sizeof(dst); fr = fin->fin_fr; if ((fr != NULL) && !(fr->fr_flags & FR_KEEPSTATE) && (fdp != NULL) && (fdp->fd_type == FRD_DSTLIST)) { if (ipf_dstlist_select_node(fin, fdp->fd_ptr, NULL, &node) == 0) fdp = &node; } if (fdp != NULL) ifp = fdp->fd_ptr; else ifp = fin->fin_ifp; if ((ifp == NULL) && ((fr == NULL) || !(fr->fr_flags & FR_FASTROUTE))) { error = -2; goto bad; } if ((fdp != NULL) && (fdp->fd_ip.s_addr != 0)) dst.sin_addr = fdp->fd_ip; fibnum = M_GETFIB(m0); if (fib4_lookup_nh_ext(fibnum, dst.sin_addr, NHR_REF, 0, &nh4) != 0) { if (in_localaddr(ip->ip_dst)) error = EHOSTUNREACH; else error = ENETUNREACH; goto bad; } has_nhop = 1; if (ifp == NULL) ifp = nh4.nh_ifp; if (nh4.nh_flags & NHF_GATEWAY) dst.sin_addr = nh4.nh_addr; /* * For input packets which are being "fastrouted", they won't * go back through output filtering and miss their chance to get * NAT'd and counted. Duplicated packets aren't considered to be * part of the normal packet stream, so do not NAT them or pass * them through stateful checking, etc. */ if ((fdp != &fr->fr_dif) && (fin->fin_out == 0)) { sifp = fin->fin_ifp; fin->fin_ifp = ifp; fin->fin_out = 1; (void) ipf_acctpkt(fin, NULL); fin->fin_fr = NULL; if (!fr || !(fr->fr_flags & FR_RETMASK)) { u_32_t pass; (void) ipf_state_check(fin, &pass); } switch (ipf_nat_checkout(fin, NULL)) { case 0 : break; case 1 : ip->ip_sum = 0; break; case -1 : error = -1; goto bad; break; } fin->fin_ifp = sifp; fin->fin_out = 0; } else ip->ip_sum = 0; /* * If small enough for interface, can just send directly. */ if (ntohs(ip->ip_len) <= ifp->if_mtu) { if (!ip->ip_sum) ip->ip_sum = in_cksum(m, hlen); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)&dst, NULL ); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { error = EMSGSIZE; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } { int mhlen, firstlen = len; struct mbuf **mnext = &m->m_act; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) { m = m0; error = ENOBUFS; goto bad; } m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); bcopy((char *)ip, (char *)mhip, sizeof(*ip)); if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); IP_HL_A(mhip, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ntohs(ip->ip_len)) len = ntohs(ip->ip_len) - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); *mnext = m; m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == 0) { error = ENOBUFS; /* ??? */ goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; mhip->ip_off = htons((u_short)mhip->ip_off); mhip->ip_sum = 0; mhip->ip_sum = in_cksum(m, mhlen); mnext = &m->m_act; } /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). */ m_adj(m0, hlen + firstlen - ip->ip_len); ip->ip_len = htons((u_short)(hlen + firstlen)); ip->ip_off = htons((u_short)IP_MF); ip->ip_sum = 0; ip->ip_sum = in_cksum(m0, hlen); sendorfree: for (m = m0; m; m = m0) { m0 = m->m_act; m->m_act = 0; if (error == 0) error = (*ifp->if_output)(ifp, m, (struct sockaddr *)&dst, NULL ); else FREE_MB_T(m); } } done: if (!error) V_ipfmain.ipf_frouteok[0]++; else V_ipfmain.ipf_frouteok[1]++; if (has_nhop) fib4_free_nh_ext(fibnum, &nh4); return 0; bad: if (error == EMSGSIZE) { sifp = fin->fin_ifp; code = fin->fin_icode; fin->fin_icode = ICMP_UNREACH_NEEDFRAG; fin->fin_ifp = ifp; (void) ipf_send_icmp_err(ICMP_UNREACH, fin, 1); fin->fin_ifp = sifp; fin->fin_icode = code; } FREE_MB_T(m); goto done; } int ipf_verifysrc(fin) fr_info_t *fin; { struct nhop4_basic nh4; if (fib4_lookup_nh_basic(0, fin->fin_src, 0, 0, &nh4) != 0) return (0); return (fin->fin_ifp == nh4.nh_ifp); } /* * return the first IP Address associated with an interface */ int ipf_ifpaddr(softc, v, atype, ifptr, inp, inpmask) ipf_main_softc_t *softc; int v, atype; void *ifptr; i6addr_t *inp, *inpmask; { #ifdef USE_INET6 struct in6_addr *ia6 = NULL; #endif struct sockaddr *sock, *mask; struct sockaddr_in *sin; struct ifaddr *ifa; struct ifnet *ifp; if ((ifptr == NULL) || (ifptr == (void *)-1)) return -1; sin = NULL; ifp = ifptr; if (v == 4) inp->in4.s_addr = 0; #ifdef USE_INET6 else if (v == 6) bzero((char *)inp, sizeof(*inp)); #endif ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); sock = ifa->ifa_addr; while (sock != NULL && ifa != NULL) { sin = (struct sockaddr_in *)sock; if ((v == 4) && (sin->sin_family == AF_INET)) break; #ifdef USE_INET6 if ((v == 6) && (sin->sin_family == AF_INET6)) { ia6 = &((struct sockaddr_in6 *)sin)->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(ia6) && !IN6_IS_ADDR_LOOPBACK(ia6)) break; } #endif ifa = CK_STAILQ_NEXT(ifa, ifa_link); if (ifa != NULL) sock = ifa->ifa_addr; } if (ifa == NULL || sin == NULL) return -1; mask = ifa->ifa_netmask; if (atype == FRI_BROADCAST) sock = ifa->ifa_broadaddr; else if (atype == FRI_PEERADDR) sock = ifa->ifa_dstaddr; if (sock == NULL) return -1; #ifdef USE_INET6 if (v == 6) { return ipf_ifpfillv6addr(atype, (struct sockaddr_in6 *)sock, (struct sockaddr_in6 *)mask, inp, inpmask); } #endif return ipf_ifpfillv4addr(atype, (struct sockaddr_in *)sock, (struct sockaddr_in *)mask, &inp->in4, &inpmask->in4); } u_32_t ipf_newisn(fin) fr_info_t *fin; { u_32_t newiss; newiss = arc4random(); return newiss; } INLINE int ipf_checkv4sum(fin) fr_info_t *fin; { #ifdef CSUM_DATA_VALID int manual = 0; u_short sum; ip_t *ip; mb_t *m; if ((fin->fin_flx & FI_NOCKSUM) != 0) return 0; if ((fin->fin_flx & FI_SHORT) != 0) return 1; if (fin->fin_cksum != FI_CK_NEEDED) return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; m = fin->fin_m; if (m == NULL) { manual = 1; goto skipauto; } ip = fin->fin_ip; if ((m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)) == CSUM_IP_CHECKED) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_csum_ip_checked, fr_info_t *, fin, u_int, m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)); return -1; } if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { /* Depending on the driver, UDP may have zero checksum */ if (fin->fin_p == IPPROTO_UDP && (fin->fin_flx & (FI_FRAG|FI_SHORT|FI_BAD)) == 0) { udphdr_t *udp = fin->fin_dp; if (udp->uh_sum == 0) { /* * we're good no matter what the hardware * checksum flags and csum_data say (handling * of csum_data for zero UDP checksum is not * consistent across all drivers) */ fin->fin_cksum = 1; return 0; } } if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) sum = m->m_pkthdr.csum_data; else sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + fin->fin_dlen + fin->fin_p)); sum ^= 0xffff; if (sum != 0) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_sum, fr_info_t *, fin, u_int, sum); } else { fin->fin_cksum = FI_CK_SUMOK; return 0; } } else { if (m->m_pkthdr.csum_flags == CSUM_DELAY_DATA) { fin->fin_cksum = FI_CK_L4FULL; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_TCP || m->m_pkthdr.csum_flags == CSUM_UDP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_IP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else { manual = 1; } } skipauto: if (manual != 0) { if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_manual, fr_info_t *, fin, u_int, manual); return -1; } } #else if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } #endif return 0; } #ifdef USE_INET6 INLINE int ipf_checkv6sum(fin) fr_info_t *fin; { if ((fin->fin_flx & FI_NOCKSUM) != 0) { DT(ipf_checkv6sum_fi_nocksum); return 0; } if ((fin->fin_flx & FI_SHORT) != 0) { DT(ipf_checkv6sum_fi_short); return 1; } if (fin->fin_cksum != FI_CK_NEEDED) { DT(ipf_checkv6sum_fi_ck_needed); return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; } if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv6sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } return 0; } #endif /* USE_INET6 */ size_t mbufchainlen(m0) struct mbuf *m0; { size_t len; if ((m0->m_flags & M_PKTHDR) != 0) { len = m0->m_pkthdr.len; } else { struct mbuf *m; for (m = m0, len = 0; m != NULL; m = m->m_next) len += m->m_len; } return len; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pullup */ /* Returns: NULL == pullup failed, else pointer to protocol header */ /* Parameters: xmin(I)- pointer to buffer where data packet starts */ /* fin(I) - pointer to packet information */ /* len(I) - number of bytes to pullup */ /* */ /* Attempt to move at least len bytes (from the start of the buffer) into a */ /* single buffer for ease of access. Operating system native functions are */ /* used to manage buffers - if necessary. If the entire packet ends up in */ /* a single buffer, set the FI_COALESCE flag even though ipf_coalesce() has */ /* not been called. Both fin_ip and fin_dp are updated before exiting _IF_ */ /* and ONLY if the pullup succeeds. */ /* */ /* We assume that 'xmin' is a pointer to a buffer that is part of the chain */ /* of buffers that starts at *fin->fin_mp. */ /* ------------------------------------------------------------------------ */ void * ipf_pullup(xmin, fin, len) mb_t *xmin; fr_info_t *fin; int len; { int dpoff, ipoff; mb_t *m = xmin; char *ip; if (m == NULL) return NULL; ip = (char *)fin->fin_ip; if ((fin->fin_flx & FI_COALESCE) != 0) return ip; ipoff = fin->fin_ipoff; if (fin->fin_dp != NULL) dpoff = (char *)fin->fin_dp - (char *)ip; else dpoff = 0; if (M_LEN(m) < len) { mb_t *n = *fin->fin_mp; /* * Assume that M_PKTHDR is set and just work with what is left * rather than check.. * Should not make any real difference, anyway. */ if (m != n) { /* * Record the mbuf that points to the mbuf that we're * about to go to work on so that we can update the * m_next appropriately later. */ for (; n->m_next != m; n = n->m_next) ; } else { n = NULL; } #ifdef MHLEN if (len > MHLEN) #else if (len > MLEN) #endif { #ifdef HAVE_M_PULLDOWN if (m_pulldown(m, 0, len, NULL) == NULL) m = NULL; #else FREE_MB_T(*fin->fin_mp); m = NULL; n = NULL; #endif } else { m = m_pullup(m, len); } if (n != NULL) n->m_next = m; if (m == NULL) { /* * When n is non-NULL, it indicates that m pointed to * a sub-chain (tail) of the mbuf and that the head * of this chain has not yet been free'd. */ if (n != NULL) { FREE_MB_T(*fin->fin_mp); } *fin->fin_mp = NULL; fin->fin_m = NULL; return NULL; } if (n == NULL) *fin->fin_mp = m; while (M_LEN(m) == 0) { m = m->m_next; } fin->fin_m = m; ip = MTOD(m, char *) + ipoff; fin->fin_ip = (ip_t *)ip; if (fin->fin_dp != NULL) fin->fin_dp = (char *)fin->fin_ip + dpoff; if (fin->fin_fraghdr != NULL) fin->fin_fraghdr = (char *)ip + ((char *)fin->fin_fraghdr - (char *)fin->fin_ip); } if (len == fin->fin_plen) fin->fin_flx |= FI_COALESCE; return ip; } int ipf_inject(fin, m) fr_info_t *fin; mb_t *m; { + struct epoch_tracker et; int error = 0; + NET_EPOCH_ENTER(et); if (fin->fin_out == 0) { netisr_dispatch(NETISR_IP, m); } else { fin->fin_ip->ip_len = ntohs(fin->fin_ip->ip_len); fin->fin_ip->ip_off = ntohs(fin->fin_ip->ip_off); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); } + NET_EPOCH_EXIT(et); return error; } VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet_hook); VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet6_hook); #define V_ipf_inet_hook VNET(ipf_inet_hook) #define V_ipf_inet6_hook VNET(ipf_inet6_hook) int ipf_pfil_unhook(void) { pfil_remove_hook(V_ipf_inet_hook); #ifdef USE_INET6 pfil_remove_hook(V_ipf_inet6_hook); #endif return (0); } int ipf_pfil_hook(void) { struct pfil_hook_args pha; struct pfil_link_args pla; int error, error6; pha.pa_version = PFIL_VERSION; pha.pa_flags = PFIL_IN | PFIL_OUT; pha.pa_modname = "ipfilter"; pha.pa_rulname = "default-ip4"; pha.pa_func = ipf_check_wrapper; pha.pa_ruleset = NULL; pha.pa_type = PFIL_TYPE_IP4; V_ipf_inet_hook = pfil_add_hook(&pha); #ifdef USE_INET6 pha.pa_rulname = "default-ip6"; pha.pa_func = ipf_check_wrapper6; pha.pa_type = PFIL_TYPE_IP6; V_ipf_inet6_hook = pfil_add_hook(&pha); #endif pla.pa_version = PFIL_VERSION; pla.pa_flags = PFIL_IN | PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR; pla.pa_head = V_inet_pfil_head; pla.pa_hook = V_ipf_inet_hook; error = pfil_link(&pla); error6 = 0; #ifdef USE_INET6 pla.pa_head = V_inet6_pfil_head; pla.pa_hook = V_ipf_inet6_hook; error6 = pfil_link(&pla); #endif if (error || error6) error = ENODEV; else error = 0; return (error); } void ipf_event_reg(void) { V_ipf_arrivetag = EVENTHANDLER_REGISTER(ifnet_arrival_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); V_ipf_departtag = EVENTHANDLER_REGISTER(ifnet_departure_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); #if 0 V_ipf_clonetag = EVENTHANDLER_REGISTER(if_clone_event, ipf_ifevent, \ NULL, EVENTHANDLER_PRI_ANY); #endif } void ipf_event_dereg(void) { if (V_ipf_arrivetag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, V_ipf_arrivetag); } if (V_ipf_departtag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, V_ipf_departtag); } #if 0 if (V_ipf_clonetag != NULL) { EVENTHANDLER_DEREGISTER(if_clone_event, V_ipf_clonetag); } #endif } u_32_t ipf_random() { return arc4random(); } u_int ipf_pcksum(fin, hlen, sum) fr_info_t *fin; int hlen; u_int sum; { struct mbuf *m; u_int sum2; int off; m = fin->fin_m; off = (char *)fin->fin_dp - (char *)fin->fin_ip; m->m_data += hlen; m->m_len -= hlen; sum2 = in_cksum(fin->fin_m, fin->fin_plen - off); m->m_len += hlen; m->m_data -= hlen; /* * Both sum and sum2 are partial sums, so combine them together. */ sum += ~sum2 & 0xffff; while (sum > 0xffff) sum = (sum & 0xffff) + (sum >> 16); sum2 = ~sum & 0xffff; return sum2; } diff --git a/sys/dev/firewire/if_fwip.c b/sys/dev/firewire/if_fwip.c index 35a58c013707..6d2b3fb59221 100644 --- a/sys/dev/firewire/if_fwip.c +++ b/sys/dev/firewire/if_fwip.c @@ -1,933 +1,936 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2004 * Doug Rabson * Copyright (c) 2002-2003 * Hidetoshi Shimokawa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * * This product includes software developed by Hidetoshi Shimokawa. * * 4. Neither the name of the author nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #include "opt_inet.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * We really need a mechanism for allocating regions in the FIFO * address space. We pick a address in the OHCI controller's 'middle' * address space. This means that the controller will automatically * send responses for us, which is fine since we don't have any * important information to put in the response anyway. */ #define INET_FIFO 0xfffe00000000LL #define FWIPDEBUG if (fwipdebug) if_printf #define TX_MAX_QUEUE (FWMAXQUEUE - 1) /* network interface */ static void fwip_start (struct ifnet *); static int fwip_ioctl (struct ifnet *, u_long, caddr_t); static void fwip_init (void *); static void fwip_post_busreset (void *); static void fwip_output_callback (struct fw_xfer *); static void fwip_async_output (struct fwip_softc *, struct ifnet *); static void fwip_start_send (void *, int); static void fwip_stream_input (struct fw_xferq *); static void fwip_unicast_input(struct fw_xfer *); static int fwipdebug = 0; static int broadcast_channel = 0xc0 | 0x1f; /* tag | channel(XXX) */ static int tx_speed = 2; static int rx_queue_len = FWMAXQUEUE; static MALLOC_DEFINE(M_FWIP, "if_fwip", "IP over FireWire interface"); SYSCTL_INT(_debug, OID_AUTO, if_fwip_debug, CTLFLAG_RW, &fwipdebug, 0, ""); SYSCTL_DECL(_hw_firewire); static SYSCTL_NODE(_hw_firewire, OID_AUTO, fwip, CTLFLAG_RD, 0, "Firewire ip subsystem"); SYSCTL_INT(_hw_firewire_fwip, OID_AUTO, rx_queue_len, CTLFLAG_RWTUN, &rx_queue_len, 0, "Length of the receive queue"); #ifdef DEVICE_POLLING static poll_handler_t fwip_poll; static int fwip_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct fwip_softc *fwip; struct firewire_comm *fc; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); fwip = ((struct fwip_eth_softc *)ifp->if_softc)->fwip; fc = fwip->fd.fc; fc->poll(fc, (cmd == POLL_AND_CHECK_STATUS)?0:1, count); return (0); } #endif /* DEVICE_POLLING */ static void fwip_identify(driver_t *driver, device_t parent) { BUS_ADD_CHILD(parent, 0, "fwip", device_get_unit(parent)); } static int fwip_probe(device_t dev) { device_t pa; pa = device_get_parent(dev); if (device_get_unit(dev) != device_get_unit(pa)) { return (ENXIO); } device_set_desc(dev, "IP over FireWire"); return (0); } static int fwip_attach(device_t dev) { struct fwip_softc *fwip; struct ifnet *ifp; int unit, s; struct fw_hwaddr *hwaddr; fwip = ((struct fwip_softc *)device_get_softc(dev)); unit = device_get_unit(dev); ifp = fwip->fw_softc.fwip_ifp = if_alloc(IFT_IEEE1394); if (ifp == NULL) return (ENOSPC); mtx_init(&fwip->mtx, "fwip", NULL, MTX_DEF); /* XXX */ fwip->dma_ch = -1; fwip->fd.fc = device_get_ivars(dev); if (tx_speed < 0) tx_speed = fwip->fd.fc->speed; fwip->fd.dev = dev; fwip->fd.post_explore = NULL; fwip->fd.post_busreset = fwip_post_busreset; fwip->fw_softc.fwip = fwip; TASK_INIT(&fwip->start_send, 0, fwip_start_send, fwip); /* * Encode our hardware the way that arp likes it. */ hwaddr = &IFP2FWC(fwip->fw_softc.fwip_ifp)->fc_hwaddr; hwaddr->sender_unique_ID_hi = htonl(fwip->fd.fc->eui.hi); hwaddr->sender_unique_ID_lo = htonl(fwip->fd.fc->eui.lo); hwaddr->sender_max_rec = fwip->fd.fc->maxrec; hwaddr->sspd = fwip->fd.fc->speed; hwaddr->sender_unicast_FIFO_hi = htons((uint16_t)(INET_FIFO >> 32)); hwaddr->sender_unicast_FIFO_lo = htonl((uint32_t)INET_FIFO); /* fill the rest and attach interface */ ifp->if_softc = &fwip->fw_softc; if_initname(ifp, device_get_name(dev), unit); ifp->if_init = fwip_init; ifp->if_start = fwip_start; ifp->if_ioctl = fwip_ioctl; ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); ifp->if_snd.ifq_maxlen = TX_MAX_QUEUE; #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif s = splimp(); firewire_ifattach(ifp, hwaddr); splx(s); FWIPDEBUG(ifp, "interface created\n"); return 0; } static void fwip_stop(struct fwip_softc *fwip) { struct firewire_comm *fc; struct fw_xferq *xferq; struct ifnet *ifp = fwip->fw_softc.fwip_ifp; struct fw_xfer *xfer, *next; int i; fc = fwip->fd.fc; if (fwip->dma_ch >= 0) { xferq = fc->ir[fwip->dma_ch]; if (xferq->flag & FWXFERQ_RUNNING) fc->irx_disable(fc, fwip->dma_ch); xferq->flag &= ~(FWXFERQ_MODEMASK | FWXFERQ_OPEN | FWXFERQ_STREAM | FWXFERQ_EXTBUF | FWXFERQ_HANDLER | FWXFERQ_CHTAGMASK); xferq->hand = NULL; for (i = 0; i < xferq->bnchunk; i++) m_freem(xferq->bulkxfer[i].mbuf); free(xferq->bulkxfer, M_FWIP); fw_bindremove(fc, &fwip->fwb); for (xfer = STAILQ_FIRST(&fwip->fwb.xferlist); xfer != NULL; xfer = next) { next = STAILQ_NEXT(xfer, link); fw_xfer_free(xfer); } for (xfer = STAILQ_FIRST(&fwip->xferlist); xfer != NULL; xfer = next) { next = STAILQ_NEXT(xfer, link); fw_xfer_free(xfer); } STAILQ_INIT(&fwip->xferlist); xferq->bulkxfer = NULL; fwip->dma_ch = -1; } ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } static int fwip_detach(device_t dev) { struct fwip_softc *fwip; struct ifnet *ifp; int s; fwip = (struct fwip_softc *)device_get_softc(dev); ifp = fwip->fw_softc.fwip_ifp; #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif s = splimp(); fwip_stop(fwip); firewire_ifdetach(ifp); if_free(ifp); mtx_destroy(&fwip->mtx); splx(s); return 0; } static void fwip_init(void *arg) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)arg)->fwip; struct firewire_comm *fc; struct ifnet *ifp = fwip->fw_softc.fwip_ifp; struct fw_xferq *xferq; struct fw_xfer *xfer; struct mbuf *m; int i; FWIPDEBUG(ifp, "initializing\n"); fc = fwip->fd.fc; #define START 0 if (fwip->dma_ch < 0) { fwip->dma_ch = fw_open_isodma(fc, /* tx */0); if (fwip->dma_ch < 0) return; xferq = fc->ir[fwip->dma_ch]; xferq->flag |= FWXFERQ_EXTBUF | FWXFERQ_HANDLER | FWXFERQ_STREAM; xferq->flag &= ~0xff; xferq->flag |= broadcast_channel & 0xff; /* register fwip_input handler */ xferq->sc = (caddr_t) fwip; xferq->hand = fwip_stream_input; xferq->bnchunk = rx_queue_len; xferq->bnpacket = 1; xferq->psize = MCLBYTES; xferq->queued = 0; xferq->buf = NULL; xferq->bulkxfer = (struct fw_bulkxfer *) malloc( sizeof(struct fw_bulkxfer) * xferq->bnchunk, M_FWIP, M_WAITOK); if (xferq->bulkxfer == NULL) { printf("if_fwip: malloc failed\n"); return; } STAILQ_INIT(&xferq->stvalid); STAILQ_INIT(&xferq->stfree); STAILQ_INIT(&xferq->stdma); xferq->stproc = NULL; for (i = 0; i < xferq->bnchunk; i++) { m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xferq->bulkxfer[i].mbuf = m; m->m_len = m->m_pkthdr.len = m->m_ext.ext_size; STAILQ_INSERT_TAIL(&xferq->stfree, &xferq->bulkxfer[i], link); } fwip->fwb.start = INET_FIFO; fwip->fwb.end = INET_FIFO + 16384; /* S3200 packet size */ /* pre-allocate xfer */ STAILQ_INIT(&fwip->fwb.xferlist); for (i = 0; i < rx_queue_len; i++) { xfer = fw_xfer_alloc(M_FWIP); if (xfer == NULL) break; m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xfer->recv.payload = mtod(m, uint32_t *); xfer->recv.pay_len = MCLBYTES; xfer->hand = fwip_unicast_input; xfer->fc = fc; xfer->sc = (caddr_t)fwip; xfer->mbuf = m; STAILQ_INSERT_TAIL(&fwip->fwb.xferlist, xfer, link); } fw_bindadd(fc, &fwip->fwb); STAILQ_INIT(&fwip->xferlist); for (i = 0; i < TX_MAX_QUEUE; i++) { xfer = fw_xfer_alloc(M_FWIP); if (xfer == NULL) break; xfer->send.spd = tx_speed; xfer->fc = fwip->fd.fc; xfer->sc = (caddr_t)fwip; xfer->hand = fwip_output_callback; STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); } } else xferq = fc->ir[fwip->dma_ch]; fwip->last_dest.hi = 0; fwip->last_dest.lo = 0; /* start dma */ if ((xferq->flag & FWXFERQ_RUNNING) == 0) fc->irx_enable(fc, fwip->dma_ch); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; #if 0 /* attempt to start output */ fwip_start(ifp); #endif } static int fwip_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)ifp->if_softc)->fwip; int s, error; switch (cmd) { case SIOCSIFFLAGS: s = splimp(); if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) fwip_init(&fwip->fw_softc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) fwip_stop(fwip); } splx(s); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFCAP: #ifdef DEVICE_POLLING { struct ifreq *ifr = (struct ifreq *) data; struct firewire_comm *fc = fwip->fd.fc; if (ifr->ifr_reqcap & IFCAP_POLLING && !(ifp->if_capenable & IFCAP_POLLING)) { error = ether_poll_register(fwip_poll, ifp); if (error) return (error); /* Disable interrupts */ fc->set_intr(fc, 0); ifp->if_capenable |= IFCAP_POLLING; return (error); } if (!(ifr->ifr_reqcap & IFCAP_POLLING) && ifp->if_capenable & IFCAP_POLLING) { error = ether_poll_deregister(ifp); /* Enable interrupts. */ fc->set_intr(fc, 1); ifp->if_capenable &= ~IFCAP_POLLING; return (error); } } #endif /* DEVICE_POLLING */ break; default: s = splimp(); error = firewire_ioctl(ifp, cmd, data); splx(s); return (error); } return (0); } static void fwip_post_busreset(void *arg) { struct fwip_softc *fwip = arg; struct crom_src *src; struct crom_chunk *root; src = fwip->fd.fc->crom_src; root = fwip->fd.fc->crom_root; /* RFC2734 IPv4 over IEEE1394 */ bzero(&fwip->unit4, sizeof(struct crom_chunk)); crom_add_chunk(src, root, &fwip->unit4, CROM_UDIR); crom_add_entry(&fwip->unit4, CSRKEY_SPEC, CSRVAL_IETF); crom_add_simple_text(src, &fwip->unit4, &fwip->spec4, "IANA"); crom_add_entry(&fwip->unit4, CSRKEY_VER, 1); crom_add_simple_text(src, &fwip->unit4, &fwip->ver4, "IPv4"); /* RFC3146 IPv6 over IEEE1394 */ bzero(&fwip->unit6, sizeof(struct crom_chunk)); crom_add_chunk(src, root, &fwip->unit6, CROM_UDIR); crom_add_entry(&fwip->unit6, CSRKEY_SPEC, CSRVAL_IETF); crom_add_simple_text(src, &fwip->unit6, &fwip->spec6, "IANA"); crom_add_entry(&fwip->unit6, CSRKEY_VER, 2); crom_add_simple_text(src, &fwip->unit6, &fwip->ver6, "IPv6"); fwip->last_dest.hi = 0; fwip->last_dest.lo = 0; firewire_busreset(fwip->fw_softc.fwip_ifp); } static void fwip_output_callback(struct fw_xfer *xfer) { struct fwip_softc *fwip; struct ifnet *ifp; int s; fwip = (struct fwip_softc *)xfer->sc; ifp = fwip->fw_softc.fwip_ifp; /* XXX error check */ FWIPDEBUG(ifp, "resp = %d\n", xfer->resp); if (xfer->resp != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(xfer->mbuf); fw_xfer_unload(xfer); s = splimp(); FWIP_LOCK(fwip); STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); splx(s); /* for queue full */ if (ifp->if_snd.ifq_head != NULL) { fwip_start(ifp); } } static void fwip_start(struct ifnet *ifp) { struct fwip_softc *fwip = ((struct fwip_eth_softc *)ifp->if_softc)->fwip; int s; FWIPDEBUG(ifp, "starting\n"); if (fwip->dma_ch < 0) { struct mbuf *m = NULL; FWIPDEBUG(ifp, "not ready\n"); s = splimp(); do { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } while (m != NULL); splx(s); return; } s = splimp(); ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (ifp->if_snd.ifq_len != 0) fwip_async_output(fwip, ifp); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; splx(s); } /* Async. stream output */ static void fwip_async_output(struct fwip_softc *fwip, struct ifnet *ifp) { struct firewire_comm *fc = fwip->fd.fc; struct mbuf *m; struct m_tag *mtag; struct fw_hwaddr *destfw; struct fw_xfer *xfer; struct fw_xferq *xferq; struct fw_pkt *fp; uint16_t nodeid; int error; int i = 0; xfer = NULL; xferq = fc->atq; while ((xferq->queued < xferq->maxq - 1) && (ifp->if_snd.ifq_head != NULL)) { FWIP_LOCK(fwip); xfer = STAILQ_FIRST(&fwip->xferlist); if (xfer == NULL) { FWIP_UNLOCK(fwip); #if 0 printf("if_fwip: lack of xfer\n"); #endif break; } STAILQ_REMOVE_HEAD(&fwip->xferlist, link); FWIP_UNLOCK(fwip); IF_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { FWIP_LOCK(fwip); STAILQ_INSERT_HEAD(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); break; } /* * Dig out the link-level address which * firewire_output got via arp or neighbour * discovery. If we don't have a link-level address, * just stick the thing on the broadcast channel. */ mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, 0); if (mtag == NULL) destfw = NULL; else destfw = (struct fw_hwaddr *) (mtag + 1); /* * We don't do any bpf stuff here - the generic code * in firewire_output gives the packet to bpf before * it adds the link-level encapsulation. */ /* * Put the mbuf in the xfer early in case we hit an * error case below - fwip_output_callback will free * the mbuf. */ xfer->mbuf = m; /* * We use the arp result (if any) to add a suitable firewire * packet header before handing off to the bus. */ fp = &xfer->send.hdr; nodeid = FWLOCALBUS | fc->nodeid; if ((m->m_flags & M_BCAST) || !destfw) { /* * Broadcast packets are sent as GASP packets with * specifier ID 0x00005e, version 1 on the broadcast * channel. To be conservative, we send at the * slowest possible speed. */ uint32_t *p; M_PREPEND(m, 2*sizeof(uint32_t), M_NOWAIT); p = mtod(m, uint32_t *); fp->mode.stream.len = m->m_pkthdr.len; fp->mode.stream.chtag = broadcast_channel; fp->mode.stream.tcode = FWTCODE_STREAM; fp->mode.stream.sy = 0; xfer->send.spd = 0; p[0] = htonl(nodeid << 16); p[1] = htonl((0x5e << 24) | 1); } else { /* * Unicast packets are sent as block writes to the * target's unicast fifo address. If we can't * find the node address, we just give up. We * could broadcast it but that might overflow * the packet size limitations due to the * extra GASP header. Note: the hardware * address is stored in network byte order to * make life easier for ARP. */ struct fw_device *fd; struct fw_eui64 eui; eui.hi = ntohl(destfw->sender_unique_ID_hi); eui.lo = ntohl(destfw->sender_unique_ID_lo); if (fwip->last_dest.hi != eui.hi || fwip->last_dest.lo != eui.lo) { fd = fw_noderesolve_eui64(fc, &eui); if (!fd) { /* error */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX set error code */ fwip_output_callback(xfer); continue; } fwip->last_hdr.mode.wreqb.dst = FWLOCALBUS | fd->dst; fwip->last_hdr.mode.wreqb.tlrt = 0; fwip->last_hdr.mode.wreqb.tcode = FWTCODE_WREQB; fwip->last_hdr.mode.wreqb.pri = 0; fwip->last_hdr.mode.wreqb.src = nodeid; fwip->last_hdr.mode.wreqb.dest_hi = ntohs(destfw->sender_unicast_FIFO_hi); fwip->last_hdr.mode.wreqb.dest_lo = ntohl(destfw->sender_unicast_FIFO_lo); fwip->last_hdr.mode.wreqb.extcode = 0; fwip->last_dest = eui; } fp->mode.wreqb = fwip->last_hdr.mode.wreqb; fp->mode.wreqb.len = m->m_pkthdr.len; xfer->send.spd = min(destfw->sspd, fc->speed); } xfer->send.pay_len = m->m_pkthdr.len; error = fw_asyreq(fc, -1, xfer); if (error == EAGAIN) { /* * We ran out of tlabels - requeue the packet * for later transmission. */ xfer->mbuf = 0; FWIP_LOCK(fwip); STAILQ_INSERT_TAIL(&fwip->xferlist, xfer, link); FWIP_UNLOCK(fwip); IF_PREPEND(&ifp->if_snd, m); break; } if (error) { /* error */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX set error code */ fwip_output_callback(xfer); continue; } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); i++; } } #if 0 if (i > 1) printf("%d queued\n", i); #endif if (i > 0) xferq->start(fc); } static void fwip_start_send (void *arg, int count) { struct fwip_softc *fwip = arg; fwip->fd.fc->atq->start(fwip->fd.fc); } /* Async. stream output */ static void fwip_stream_input(struct fw_xferq *xferq) { + struct epoch_tracker et; struct mbuf *m, *m0; struct m_tag *mtag; struct ifnet *ifp; struct fwip_softc *fwip; struct fw_bulkxfer *sxfer; struct fw_pkt *fp; uint16_t src; uint32_t *p; fwip = (struct fwip_softc *)xferq->sc; ifp = fwip->fw_softc.fwip_ifp; + NET_EPOCH_ENTER(et); while ((sxfer = STAILQ_FIRST(&xferq->stvalid)) != NULL) { STAILQ_REMOVE_HEAD(&xferq->stvalid, link); fp = mtod(sxfer->mbuf, struct fw_pkt *); if (fwip->fd.fc->irx_post != NULL) fwip->fd.fc->irx_post(fwip->fd.fc, fp->mode.ld); m = sxfer->mbuf; /* insert new rbuf */ sxfer->mbuf = m0 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m0 != NULL) { m0->m_len = m0->m_pkthdr.len = m0->m_ext.ext_size; STAILQ_INSERT_TAIL(&xferq->stfree, sxfer, link); } else printf("fwip_as_input: m_getcl failed\n"); /* * We must have a GASP header - leave the * encapsulation sanity checks to the generic * code. Remember that we also have the firewire async * stream header even though that isn't accounted for * in mode.stream.len. */ if (sxfer->resp != 0 || fp->mode.stream.len < 2*sizeof(uint32_t)) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } m->m_len = m->m_pkthdr.len = fp->mode.stream.len + sizeof(fp->mode.stream); /* * If we received the packet on the broadcast channel, * mark it as broadcast, otherwise we assume it must * be multicast. */ if (fp->mode.stream.chtag == broadcast_channel) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; /* * Make sure we recognise the GASP specifier and * version. */ p = mtod(m, uint32_t *); if ((((ntohl(p[1]) & 0xffff) << 8) | ntohl(p[2]) >> 24) != 0x00005e || (ntohl(p[2]) & 0xffffff) != 1) { FWIPDEBUG(ifp, "Unrecognised GASP header %#08x %#08x\n", ntohl(p[1]), ntohl(p[2])); m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); continue; } /* * Record the sender ID for possible BPF usage. */ src = ntohl(p[1]) >> 16; if (bpf_peers_present(ifp->if_bpf)) { mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 2*sizeof(uint32_t), M_NOWAIT); if (mtag) { /* bpf wants it in network byte order */ struct fw_device *fd; uint32_t *p = (uint32_t *) (mtag + 1); fd = fw_noderesolve_nodeid(fwip->fd.fc, src & 0x3f); if (fd) { p[0] = htonl(fd->eui.hi); p[1] = htonl(fd->eui.lo); } else { p[0] = 0; p[1] = 0; } m_tag_prepend(m, mtag); } } /* * Trim off the GASP header */ m_adj(m, 3*sizeof(uint32_t)); m->m_pkthdr.rcvif = ifp; firewire_input(ifp, m, src); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); } + NET_EPOCH_EXIT(et); if (STAILQ_FIRST(&xferq->stfree) != NULL) fwip->fd.fc->irx_enable(fwip->fd.fc, fwip->dma_ch); } static __inline void fwip_unicast_input_recycle(struct fwip_softc *fwip, struct fw_xfer *xfer) { struct mbuf *m; /* * We have finished with a unicast xfer. Allocate a new * cluster and stick it on the back of the input queue. */ m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); xfer->mbuf = m; xfer->recv.payload = mtod(m, uint32_t *); xfer->recv.pay_len = MCLBYTES; xfer->mbuf = m; STAILQ_INSERT_TAIL(&fwip->fwb.xferlist, xfer, link); } static void fwip_unicast_input(struct fw_xfer *xfer) { uint64_t address; struct mbuf *m; struct m_tag *mtag; struct ifnet *ifp; struct fwip_softc *fwip; struct fw_pkt *fp; //struct fw_pkt *sfp; int rtcode; fwip = (struct fwip_softc *)xfer->sc; ifp = fwip->fw_softc.fwip_ifp; m = xfer->mbuf; xfer->mbuf = 0; fp = &xfer->recv.hdr; /* * Check the fifo address - we only accept addresses of * exactly INET_FIFO. */ address = ((uint64_t)fp->mode.wreqb.dest_hi << 32) | fp->mode.wreqb.dest_lo; if (fp->mode.wreqb.tcode != FWTCODE_WREQB) { rtcode = FWRCODE_ER_TYPE; } else if (address != INET_FIFO) { rtcode = FWRCODE_ER_ADDR; } else { rtcode = FWRCODE_COMPLETE; } /* * Pick up a new mbuf and stick it on the back of the receive * queue. */ fwip_unicast_input_recycle(fwip, xfer); /* * If we've already rejected the packet, give up now. */ if (rtcode != FWRCODE_COMPLETE) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } if (bpf_peers_present(ifp->if_bpf)) { /* * Record the sender ID for possible BPF usage. */ mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 2*sizeof(uint32_t), M_NOWAIT); if (mtag) { /* bpf wants it in network byte order */ struct fw_device *fd; uint32_t *p = (uint32_t *) (mtag + 1); fd = fw_noderesolve_nodeid(fwip->fd.fc, fp->mode.wreqb.src & 0x3f); if (fd) { p[0] = htonl(fd->eui.hi); p[1] = htonl(fd->eui.lo); } else { p[0] = 0; p[1] = 0; } m_tag_prepend(m, mtag); } } /* * Hand off to the generic encapsulation code. We don't use * ifp->if_input so that we can pass the source nodeid as an * argument to facilitate link-level fragment reassembly. */ m->m_len = m->m_pkthdr.len = fp->mode.wreqb.len; m->m_pkthdr.rcvif = ifp; firewire_input(ifp, m, fp->mode.wreqb.src); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); } static devclass_t fwip_devclass; static device_method_t fwip_methods[] = { /* device interface */ DEVMETHOD(device_identify, fwip_identify), DEVMETHOD(device_probe, fwip_probe), DEVMETHOD(device_attach, fwip_attach), DEVMETHOD(device_detach, fwip_detach), { 0, 0 } }; static driver_t fwip_driver = { "fwip", fwip_methods, sizeof(struct fwip_softc), }; DRIVER_MODULE(fwip, firewire, fwip_driver, fwip_devclass, 0, 0); MODULE_VERSION(fwip, 1); MODULE_DEPEND(fwip, firewire, 1, 1, 1); diff --git a/sys/dev/iicbus/if_ic.c b/sys/dev/iicbus/if_ic.c index ca50fbb11c33..4dac86141230 100644 --- a/sys/dev/iicbus/if_ic.c +++ b/sys/dev/iicbus/if_ic.c @@ -1,436 +1,440 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998, 2001 Nicolas Souchu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * I2C bus IP driver */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" #define PCF_MASTER_ADDRESS 0xaa #define ICHDRLEN sizeof(u_int32_t) #define ICMTU 1500 /* default mtu */ struct ic_softc { struct ifnet *ic_ifp; device_t ic_dev; u_char ic_addr; /* peer I2C address */ int ic_flags; char *ic_obuf; char *ic_ifbuf; char *ic_cp; int ic_xfercnt; int ic_iferrs; struct mtx ic_lock; }; #define IC_SENDING 0x0001 #define IC_OBUF_BUSY 0x0002 #define IC_IFBUF_BUSY 0x0004 #define IC_BUFFERS_BUSY (IC_OBUF_BUSY | IC_IFBUF_BUSY) #define IC_BUFFER_WAITER 0x0004 static devclass_t ic_devclass; static int icprobe(device_t); static int icattach(device_t); static int icioctl(struct ifnet *, u_long, caddr_t); static int icoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int icintr(device_t, int, char *); static device_method_t ic_methods[] = { /* device interface */ DEVMETHOD(device_probe, icprobe), DEVMETHOD(device_attach, icattach), /* iicbus interface */ DEVMETHOD(iicbus_intr, icintr), { 0, 0 } }; static driver_t ic_driver = { "ic", ic_methods, sizeof(struct ic_softc), }; static void ic_alloc_buffers(struct ic_softc *sc, int mtu) { char *obuf, *ifbuf; obuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); ifbuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); mtx_lock(&sc->ic_lock); while (sc->ic_flags & IC_BUFFERS_BUSY) { sc->ic_flags |= IC_BUFFER_WAITER; mtx_sleep(sc, &sc->ic_lock, 0, "icalloc", 0); sc->ic_flags &= ~IC_BUFFER_WAITER; } free(sc->ic_obuf, M_DEVBUF); free(sc->ic_ifbuf, M_DEVBUF); sc->ic_obuf = obuf; sc->ic_ifbuf = ifbuf; sc->ic_ifp->if_mtu = mtu; mtx_unlock(&sc->ic_lock); } /* * icprobe() */ static int icprobe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } /* * icattach() */ static int icattach(device_t dev) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); struct ifnet *ifp; ifp = sc->ic_ifp = if_alloc(IFT_PARA); if (ifp == NULL) return (ENOSPC); mtx_init(&sc->ic_lock, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); sc->ic_addr = PCF_MASTER_ADDRESS; /* XXX only PCF masters */ sc->ic_dev = dev; ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_SIMPLEX | IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_ioctl = icioctl; ifp->if_output = icoutput; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_snd.ifq_maxlen = ifqmaxlen; ic_alloc_buffers(sc, ICMTU); if_attach(ifp); bpfattach(ifp, DLT_NULL, ICHDRLEN); return (0); } /* * iciotcl() */ static int icioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ic_softc *sc = ifp->if_softc; device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; int error; switch (cmd) { case SIOCAIFADDR: case SIOCSIFADDR: if (ifa->ifa_addr->sa_family != AF_INET) return (EAFNOSUPPORT); mtx_lock(&sc->ic_lock); ifp->if_flags |= IFF_UP; goto locked; case SIOCSIFFLAGS: mtx_lock(&sc->ic_lock); locked: if ((!(ifp->if_flags & IFF_UP)) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* XXX disable PCF */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; mtx_unlock(&sc->ic_lock); /* IFF_UP is not set, try to release the bus anyway */ iicbus_release_bus(parent, icdev); break; } if (((ifp->if_flags & IFF_UP)) && (!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { mtx_unlock(&sc->ic_lock); if ((error = iicbus_request_bus(parent, icdev, IIC_WAIT | IIC_INTR))) return (error); mtx_lock(&sc->ic_lock); iicbus_reset(parent, IIC_FASTEST, 0, NULL); ifp->if_drv_flags |= IFF_DRV_RUNNING; } mtx_unlock(&sc->ic_lock); break; case SIOCSIFMTU: ic_alloc_buffers(sc, ifr->ifr_mtu); break; case SIOCGIFMTU: mtx_lock(&sc->ic_lock); ifr->ifr_mtu = sc->ic_ifp->if_mtu; mtx_unlock(&sc->ic_lock); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) return (EAFNOSUPPORT); /* XXX */ switch (ifr->ifr_addr.sa_family) { case AF_INET: break; default: return (EAFNOSUPPORT); } break; default: return (EINVAL); } return (0); } /* * icintr() */ static int icintr(device_t dev, int event, char *ptr) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); struct mbuf *top; int len; mtx_lock(&sc->ic_lock); switch (event) { case INTR_GENERAL: case INTR_START: sc->ic_cp = sc->ic_ifbuf; sc->ic_xfercnt = 0; sc->ic_flags |= IC_IFBUF_BUSY; break; case INTR_STOP: /* if any error occurred during transfert, * drop the packet */ sc->ic_flags &= ~IC_IFBUF_BUSY; if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); if (sc->ic_iferrs) goto err; if ((len = sc->ic_xfercnt) == 0) break; /* ignore */ if (len <= ICHDRLEN) goto err; len -= ICHDRLEN; if_inc_counter(sc->ic_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->ic_ifp, IFCOUNTER_IBYTES, len); BPF_TAP(sc->ic_ifp, sc->ic_ifbuf, len + ICHDRLEN); top = m_devget(sc->ic_ifbuf + ICHDRLEN, len, 0, sc->ic_ifp, 0); if (top) { + struct epoch_tracker et; + mtx_unlock(&sc->ic_lock); M_SETFIB(top, sc->ic_ifp->if_fib); + NET_EPOCH_ENTER(et); netisr_dispatch(NETISR_IP, top); + NET_EPOCH_EXIT(et); mtx_lock(&sc->ic_lock); } break; err: if_printf(sc->ic_ifp, "errors (%d)!\n", sc->ic_iferrs); sc->ic_iferrs = 0; /* reset error count */ if_inc_counter(sc->ic_ifp, IFCOUNTER_IERRORS, 1); break; case INTR_RECEIVE: if (sc->ic_xfercnt >= sc->ic_ifp->if_mtu + ICHDRLEN) { sc->ic_iferrs++; } else { *sc->ic_cp++ = *ptr; sc->ic_xfercnt++; } break; case INTR_NOACK: /* xfer terminated by master */ break; case INTR_TRANSMIT: *ptr = 0xff; /* XXX */ break; case INTR_ERROR: sc->ic_iferrs++; break; default: panic("%s: unknown event (%d)!", __func__, event); } mtx_unlock(&sc->ic_lock); return (0); } /* * icoutput() */ static int icoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ic_softc *sc = ifp->if_softc; device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); int len, sent; struct mbuf *mm; u_char *cp; u_int32_t hdr; /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &hdr, sizeof(hdr)); else hdr = dst->sa_family; mtx_lock(&sc->ic_lock); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* already sending? */ if (sc->ic_flags & IC_SENDING) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } /* insert header */ bcopy ((char *)&hdr, sc->ic_obuf, ICHDRLEN); cp = sc->ic_obuf + ICHDRLEN; len = 0; mm = m; do { if (len + mm->m_len > sc->ic_ifp->if_mtu) { /* packet too large */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } bcopy(mtod(mm,char *), cp, mm->m_len); cp += mm->m_len; len += mm->m_len; } while ((mm = mm->m_next)); BPF_MTAP2(ifp, &hdr, sizeof(hdr), m); sc->ic_flags |= (IC_SENDING | IC_OBUF_BUSY); m_freem(m); mtx_unlock(&sc->ic_lock); /* send the packet */ if (iicbus_block_write(parent, sc->ic_addr, sc->ic_obuf, len + ICHDRLEN, &sent)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } mtx_lock(&sc->ic_lock); sc->ic_flags &= ~(IC_SENDING | IC_OBUF_BUSY); if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); mtx_unlock(&sc->ic_lock); return (0); error: m_freem(m); mtx_unlock(&sc->ic_lock); return(0); } DRIVER_MODULE(ic, iicbus, ic_driver, ic_devclass, 0, 0); MODULE_DEPEND(ic, iicbus, IICBUS_MINVER, IICBUS_PREFVER, IICBUS_MAXVER); MODULE_VERSION(ic, 1); diff --git a/sys/dev/usb/net/if_usie.c b/sys/dev/usb/net/if_usie.c index a6e9c62f2537..eb92f4aabe4b 100644 --- a/sys/dev/usb/net/if_usie.c +++ b/sys/dev/usb/net/if_usie.c @@ -1,1618 +1,1621 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Anybots Inc * written by Akinori Furukoshi * - ucom part is based on u3g.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "usbdevs.h" #define USB_DEBUG_VAR usie_debug #include #include #include #include #include #ifdef USB_DEBUG static int usie_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, usie, CTLFLAG_RW, 0, "sierra USB modem"); SYSCTL_INT(_hw_usb_usie, OID_AUTO, debug, CTLFLAG_RWTUN, &usie_debug, 0, "usie debug level"); #endif /* Sierra Wireless Direct IP modems */ static const STRUCT_USB_HOST_ID usie_devs[] = { #define USIE_DEV(v, d) { \ USB_VP(USB_VENDOR_##v, USB_PRODUCT_##v##_##d) } USIE_DEV(SIERRA, MC8700), USIE_DEV(SIERRA, TRUINSTALL), USIE_DEV(AIRPRIME, USB308), #undef USIE_DEV }; static device_probe_t usie_probe; static device_attach_t usie_attach; static device_detach_t usie_detach; static void usie_free_softc(struct usie_softc *); static void usie_free(struct ucom_softc *); static void usie_uc_update_line_state(struct ucom_softc *, uint8_t); static void usie_uc_cfg_get_status(struct ucom_softc *, uint8_t *, uint8_t *); static void usie_uc_cfg_set_dtr(struct ucom_softc *, uint8_t); static void usie_uc_cfg_set_rts(struct ucom_softc *, uint8_t); static void usie_uc_cfg_open(struct ucom_softc *); static void usie_uc_cfg_close(struct ucom_softc *); static void usie_uc_start_read(struct ucom_softc *); static void usie_uc_stop_read(struct ucom_softc *); static void usie_uc_start_write(struct ucom_softc *); static void usie_uc_stop_write(struct ucom_softc *); static usb_callback_t usie_uc_tx_callback; static usb_callback_t usie_uc_rx_callback; static usb_callback_t usie_uc_status_callback; static usb_callback_t usie_if_tx_callback; static usb_callback_t usie_if_rx_callback; static usb_callback_t usie_if_status_callback; static void usie_if_sync_to(void *); static void usie_if_sync_cb(void *, int); static void usie_if_status_cb(void *, int); static void usie_if_start(struct ifnet *); static int usie_if_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void usie_if_init(void *); static void usie_if_stop(struct usie_softc *); static int usie_if_ioctl(struct ifnet *, u_long, caddr_t); static int usie_do_request(struct usie_softc *, struct usb_device_request *, void *); static int usie_if_cmd(struct usie_softc *, uint8_t); static void usie_cns_req(struct usie_softc *, uint32_t, uint16_t); static void usie_cns_rsp(struct usie_softc *, struct usie_cns *); static void usie_hip_rsp(struct usie_softc *, uint8_t *, uint32_t); static int usie_driver_loaded(struct module *, int, void *); static const struct usb_config usie_uc_config[USIE_UC_N_XFER] = { [USIE_UC_STATUS] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = 0, /* use wMaxPacketSize */ .flags = {.pipe_bof = 1,.short_xfer_ok = 1,}, .callback = &usie_uc_status_callback, }, [USIE_UC_RX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = USIE_BUFSIZE, .flags = {.pipe_bof = 1,.short_xfer_ok = 1,.proxy_buffer = 1,}, .callback = &usie_uc_rx_callback, }, [USIE_UC_TX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .bufsize = USIE_BUFSIZE, .flags = {.pipe_bof = 1,.force_short_xfer = 1,}, .callback = &usie_uc_tx_callback, } }; static const struct usb_config usie_if_config[USIE_IF_N_XFER] = { [USIE_IF_STATUS] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = 0, /* use wMaxPacketSize */ .flags = {.pipe_bof = 1,.short_xfer_ok = 1,}, .callback = &usie_if_status_callback, }, [USIE_IF_RX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = USIE_BUFSIZE, .flags = {.pipe_bof = 1,.short_xfer_ok = 1,}, .callback = &usie_if_rx_callback, }, [USIE_IF_TX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .bufsize = MAX(USIE_BUFSIZE, MCLBYTES), .flags = {.pipe_bof = 1,.force_short_xfer = 1,}, .callback = &usie_if_tx_callback, } }; static device_method_t usie_methods[] = { DEVMETHOD(device_probe, usie_probe), DEVMETHOD(device_attach, usie_attach), DEVMETHOD(device_detach, usie_detach), DEVMETHOD_END }; static driver_t usie_driver = { .name = "usie", .methods = usie_methods, .size = sizeof(struct usie_softc), }; static devclass_t usie_devclass; static eventhandler_tag usie_etag; DRIVER_MODULE(usie, uhub, usie_driver, usie_devclass, usie_driver_loaded, 0); MODULE_DEPEND(usie, ucom, 1, 1, 1); MODULE_DEPEND(usie, usb, 1, 1, 1); MODULE_VERSION(usie, 1); USB_PNP_HOST_INFO(usie_devs); static const struct ucom_callback usie_uc_callback = { .ucom_cfg_get_status = &usie_uc_cfg_get_status, .ucom_cfg_set_dtr = &usie_uc_cfg_set_dtr, .ucom_cfg_set_rts = &usie_uc_cfg_set_rts, .ucom_cfg_open = &usie_uc_cfg_open, .ucom_cfg_close = &usie_uc_cfg_close, .ucom_start_read = &usie_uc_start_read, .ucom_stop_read = &usie_uc_stop_read, .ucom_start_write = &usie_uc_start_write, .ucom_stop_write = &usie_uc_stop_write, .ucom_free = &usie_free, }; static void usie_autoinst(void *arg, struct usb_device *udev, struct usb_attach_arg *uaa) { struct usb_interface *iface; struct usb_interface_descriptor *id; struct usb_device_request req; int err; if (uaa->dev_state != UAA_DEV_READY) return; iface = usbd_get_iface(udev, 0); if (iface == NULL) return; id = iface->idesc; if (id == NULL || id->bInterfaceClass != UICLASS_MASS) return; if (usbd_lookup_id_by_uaa(usie_devs, sizeof(usie_devs), uaa) != 0) return; /* no device match */ if (bootverbose) { DPRINTF("Ejecting %s %s\n", usb_get_manufacturer(udev), usb_get_product(udev)); } req.bmRequestType = UT_VENDOR; req.bRequest = UR_SET_INTERFACE; USETW(req.wValue, UF_DEVICE_REMOTE_WAKEUP); USETW(req.wIndex, UHF_PORT_CONNECTION); USETW(req.wLength, 0); /* at this moment there is no mutex */ err = usbd_do_request_flags(udev, NULL, &req, NULL, 0, NULL, 250 /* ms */ ); /* success, mark the udev as disappearing */ if (err == 0) uaa->dev_state = UAA_DEV_EJECTING; } static int usie_probe(device_t self) { struct usb_attach_arg *uaa = device_get_ivars(self); if (uaa->usb_mode != USB_MODE_HOST) return (ENXIO); if (uaa->info.bConfigIndex != USIE_CNFG_INDEX) return (ENXIO); if (uaa->info.bIfaceIndex != USIE_IFACE_INDEX) return (ENXIO); if (uaa->info.bInterfaceClass != UICLASS_VENDOR) return (ENXIO); return (usbd_lookup_id_by_uaa(usie_devs, sizeof(usie_devs), uaa)); } static int usie_attach(device_t self) { struct usie_softc *sc = device_get_softc(self); struct usb_attach_arg *uaa = device_get_ivars(self); struct ifnet *ifp; struct usb_interface *iface; struct usb_interface_descriptor *id; struct usb_device_request req; int err; uint16_t fwattr; uint8_t iface_index; uint8_t ifidx; uint8_t start; device_set_usb_desc(self); sc->sc_udev = uaa->device; sc->sc_dev = self; mtx_init(&sc->sc_mtx, "usie", MTX_NETWORK_LOCK, MTX_DEF); ucom_ref(&sc->sc_super_ucom); TASK_INIT(&sc->sc_if_status_task, 0, usie_if_status_cb, sc); TASK_INIT(&sc->sc_if_sync_task, 0, usie_if_sync_cb, sc); usb_callout_init_mtx(&sc->sc_if_sync_ch, &sc->sc_mtx, 0); mtx_lock(&sc->sc_mtx); /* set power mode to D0 */ req.bmRequestType = UT_WRITE_VENDOR_DEVICE; req.bRequest = USIE_POWER; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, 0); if (usie_do_request(sc, &req, NULL)) { mtx_unlock(&sc->sc_mtx); goto detach; } /* read fw attr */ fwattr = 0; req.bmRequestType = UT_READ_VENDOR_DEVICE; req.bRequest = USIE_FW_ATTR; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, sizeof(fwattr)); if (usie_do_request(sc, &req, &fwattr)) { mtx_unlock(&sc->sc_mtx); goto detach; } mtx_unlock(&sc->sc_mtx); /* check DHCP supports */ DPRINTF("fwattr=%x\n", fwattr); if (!(fwattr & USIE_FW_DHCP)) { device_printf(self, "DHCP is not supported. A firmware upgrade might be needed.\n"); } /* find available interfaces */ sc->sc_nucom = 0; for (ifidx = 0; ifidx < USIE_IFACE_MAX; ifidx++) { iface = usbd_get_iface(uaa->device, ifidx); if (iface == NULL) break; id = usbd_get_interface_descriptor(iface); if ((id == NULL) || (id->bInterfaceClass != UICLASS_VENDOR)) continue; /* setup Direct IP transfer */ if (id->bInterfaceNumber >= 7 && id->bNumEndpoints == 3) { sc->sc_if_ifnum = id->bInterfaceNumber; iface_index = ifidx; DPRINTF("ifnum=%d, ifidx=%d\n", sc->sc_if_ifnum, ifidx); err = usbd_transfer_setup(uaa->device, &iface_index, sc->sc_if_xfer, usie_if_config, USIE_IF_N_XFER, sc, &sc->sc_mtx); if (err == 0) continue; device_printf(self, "could not allocate USB transfers on " "iface_index=%d, err=%s\n", iface_index, usbd_errstr(err)); goto detach; } /* setup ucom */ if (sc->sc_nucom >= USIE_UCOM_MAX) continue; usbd_set_parent_iface(uaa->device, ifidx, uaa->info.bIfaceIndex); DPRINTF("NumEndpoints=%d bInterfaceNumber=%d\n", id->bNumEndpoints, id->bInterfaceNumber); if (id->bNumEndpoints == 2) { sc->sc_uc_xfer[sc->sc_nucom][0] = NULL; start = 1; } else start = 0; err = usbd_transfer_setup(uaa->device, &ifidx, sc->sc_uc_xfer[sc->sc_nucom] + start, usie_uc_config + start, USIE_UC_N_XFER - start, &sc->sc_ucom[sc->sc_nucom], &sc->sc_mtx); if (err != 0) { DPRINTF("usbd_transfer_setup error=%s\n", usbd_errstr(err)); continue; } mtx_lock(&sc->sc_mtx); for (; start < USIE_UC_N_XFER; start++) usbd_xfer_set_stall(sc->sc_uc_xfer[sc->sc_nucom][start]); mtx_unlock(&sc->sc_mtx); sc->sc_uc_ifnum[sc->sc_nucom] = id->bInterfaceNumber; sc->sc_nucom++; /* found a port */ } if (sc->sc_nucom == 0) { device_printf(self, "no comports found\n"); goto detach; } err = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_nucom, sc, &usie_uc_callback, &sc->sc_mtx); if (err != 0) { DPRINTF("ucom_attach failed\n"); goto detach; } DPRINTF("Found %d interfaces.\n", sc->sc_nucom); /* setup ifnet (Direct IP) */ sc->sc_ifp = ifp = if_alloc(IFT_OTHER); if (ifp == NULL) { device_printf(self, "Could not allocate a network interface\n"); goto detach; } if_initname(ifp, "usie", device_get_unit(self)); ifp->if_softc = sc; ifp->if_mtu = USIE_MTU_MAX; ifp->if_flags |= IFF_NOARP; ifp->if_init = usie_if_init; ifp->if_ioctl = usie_if_ioctl; ifp->if_start = usie_if_start; ifp->if_output = usie_if_output; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_RAW, 0); if (fwattr & USIE_PM_AUTO) { usbd_set_power_mode(uaa->device, USB_POWER_MODE_SAVE); DPRINTF("enabling automatic suspend and resume\n"); } else { usbd_set_power_mode(uaa->device, USB_POWER_MODE_ON); DPRINTF("USB power is always ON\n"); } DPRINTF("device attached\n"); return (0); detach: usie_detach(self); return (ENOMEM); } static int usie_detach(device_t self) { struct usie_softc *sc = device_get_softc(self); uint8_t x; /* detach ifnet */ if (sc->sc_ifp != NULL) { usie_if_stop(sc); usbd_transfer_unsetup(sc->sc_if_xfer, USIE_IF_N_XFER); bpfdetach(sc->sc_ifp); if_detach(sc->sc_ifp); if_free(sc->sc_ifp); sc->sc_ifp = NULL; } /* detach ucom */ if (sc->sc_nucom > 0) ucom_detach(&sc->sc_super_ucom, sc->sc_ucom); /* stop all USB transfers */ usbd_transfer_unsetup(sc->sc_if_xfer, USIE_IF_N_XFER); for (x = 0; x != USIE_UCOM_MAX; x++) usbd_transfer_unsetup(sc->sc_uc_xfer[x], USIE_UC_N_XFER); device_claim_softc(self); usie_free_softc(sc); return (0); } UCOM_UNLOAD_DRAIN(usie); static void usie_free_softc(struct usie_softc *sc) { if (ucom_unref(&sc->sc_super_ucom)) { mtx_destroy(&sc->sc_mtx); device_free_softc(sc); } } static void usie_free(struct ucom_softc *ucom) { usie_free_softc(ucom->sc_parent); } static void usie_uc_update_line_state(struct ucom_softc *ucom, uint8_t ls) { struct usie_softc *sc = ucom->sc_parent; struct usb_device_request req; if (sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_STATUS] == NULL) return; req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = USIE_LINK_STATE; USETW(req.wValue, ls); USETW(req.wIndex, sc->sc_uc_ifnum[ucom->sc_subunit]); USETW(req.wLength, 0); DPRINTF("sc_uc_ifnum=%d\n", sc->sc_uc_ifnum[ucom->sc_subunit]); usie_do_request(sc, &req, NULL); } static void usie_uc_cfg_get_status(struct ucom_softc *ucom, uint8_t *lsr, uint8_t *msr) { struct usie_softc *sc = ucom->sc_parent; *msr = sc->sc_msr; *lsr = sc->sc_lsr; } static void usie_uc_cfg_set_dtr(struct ucom_softc *ucom, uint8_t flag) { uint8_t dtr; dtr = flag ? USIE_LS_DTR : 0; usie_uc_update_line_state(ucom, dtr); } static void usie_uc_cfg_set_rts(struct ucom_softc *ucom, uint8_t flag) { uint8_t rts; rts = flag ? USIE_LS_RTS : 0; usie_uc_update_line_state(ucom, rts); } static void usie_uc_cfg_open(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; /* usbd_transfer_start() is NULL safe */ usbd_transfer_start(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_STATUS]); } static void usie_uc_cfg_close(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; usbd_transfer_stop(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_STATUS]); } static void usie_uc_start_read(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; usbd_transfer_start(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_RX]); } static void usie_uc_stop_read(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; usbd_transfer_stop(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_RX]); } static void usie_uc_start_write(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; usbd_transfer_start(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_TX]); } static void usie_uc_stop_write(struct ucom_softc *ucom) { struct usie_softc *sc = ucom->sc_parent; usbd_transfer_stop(sc->sc_uc_xfer[ucom->sc_subunit][USIE_UC_TX]); } static void usie_uc_rx_callback(struct usb_xfer *xfer, usb_error_t error) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct usie_softc *sc = ucom->sc_parent; struct usb_page_cache *pc; uint32_t actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: pc = usbd_xfer_get_frame(xfer, 0); /* handle CnS response */ if (ucom == sc->sc_ucom && actlen >= USIE_HIPCNS_MIN) { DPRINTF("transferred=%u\n", actlen); /* check if it is really CnS reply */ usbd_copy_out(pc, 0, sc->sc_resp_temp, 1); if (sc->sc_resp_temp[0] == USIE_HIP_FRM_CHR) { /* verify actlen */ if (actlen > USIE_BUFSIZE) actlen = USIE_BUFSIZE; /* get complete message */ usbd_copy_out(pc, 0, sc->sc_resp_temp, actlen); usie_hip_rsp(sc, sc->sc_resp_temp, actlen); /* need to fall though */ goto tr_setup; } /* else call ucom_put_data() */ } /* standard ucom transfer */ ucom_put_data(ucom, pc, 0, actlen); /* fall though */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: /* Error */ if (error != USB_ERR_CANCELLED) { usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void usie_uc_tx_callback(struct usb_xfer *xfer, usb_error_t error) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct usb_page_cache *pc; uint32_t actlen; switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: case USB_ST_SETUP: tr_setup: pc = usbd_xfer_get_frame(xfer, 0); /* handle CnS request */ struct mbuf *m = usbd_xfer_get_priv(xfer); if (m != NULL) { usbd_m_copy_in(pc, 0, m, 0, m->m_pkthdr.len); usbd_xfer_set_frame_len(xfer, 0, m->m_pkthdr.len); usbd_xfer_set_priv(xfer, NULL); usbd_transfer_submit(xfer); m_freem(m); break; } /* standard ucom transfer */ if (ucom_get_data(ucom, pc, 0, USIE_BUFSIZE, &actlen)) { usbd_xfer_set_frame_len(xfer, 0, actlen); usbd_transfer_submit(xfer); } break; default: /* Error */ if (error != USB_ERR_CANCELLED) { usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void usie_uc_status_callback(struct usb_xfer *xfer, usb_error_t error) { struct usb_page_cache *pc; struct { struct usb_device_request req; uint16_t param; } st; uint32_t actlen; uint16_t param; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(4, "info received, actlen=%u\n", actlen); if (actlen < sizeof(st)) { DPRINTF("data too short actlen=%u\n", actlen); goto tr_setup; } pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_out(pc, 0, &st, sizeof(st)); if (st.req.bmRequestType == 0xa1 && st.req.bRequest == 0x20) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct usie_softc *sc = ucom->sc_parent; param = le16toh(st.param); DPRINTF("param=%x\n", param); sc->sc_msr = sc->sc_lsr = 0; sc->sc_msr |= (param & USIE_DCD) ? SER_DCD : 0; sc->sc_msr |= (param & USIE_DSR) ? SER_DSR : 0; sc->sc_msr |= (param & USIE_RI) ? SER_RI : 0; sc->sc_msr |= (param & USIE_CTS) ? 0 : SER_CTS; sc->sc_msr |= (param & USIE_RTS) ? SER_RTS : 0; sc->sc_msr |= (param & USIE_DTR) ? SER_DTR : 0; } /* fall though */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: /* Error */ DPRINTF("USB transfer error, %s\n", usbd_errstr(error)); if (error != USB_ERR_CANCELLED) { usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void usie_if_rx_callback(struct usb_xfer *xfer, usb_error_t error) { + struct epoch_tracker et; struct usie_softc *sc = usbd_xfer_softc(xfer); struct ifnet *ifp = sc->sc_ifp; struct mbuf *m0; struct mbuf *m = NULL; struct usie_desc *rxd; uint32_t actlen; uint16_t err; uint16_t pkt; uint16_t ipl; uint16_t len; uint16_t diff; uint8_t pad; uint8_t ipv; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(15, "rx done, actlen=%u\n", actlen); if (actlen < sizeof(struct usie_hip)) { DPRINTF("data too short %u\n", actlen); goto tr_setup; } m = sc->sc_rxm; sc->sc_rxm = NULL; /* fall though */ case USB_ST_SETUP: tr_setup: if (sc->sc_rxm == NULL) { sc->sc_rxm = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE /* could be bigger than MCLBYTES */ ); } if (sc->sc_rxm == NULL) { DPRINTF("could not allocate Rx mbuf\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); usbd_xfer_set_stall(xfer); usbd_xfer_set_frames(xfer, 0); } else { /* * Directly loading a mbuf cluster into DMA to * save some data copying. This works because * there is only one cluster. */ usbd_xfer_set_frame_data(xfer, 0, mtod(sc->sc_rxm, caddr_t), MIN(MJUMPAGESIZE, USIE_RXSZ_MAX)); usbd_xfer_set_frames(xfer, 1); } usbd_transfer_submit(xfer); break; default: /* Error */ DPRINTF("USB transfer error, %s\n", usbd_errstr(error)); if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto tr_setup; } if (sc->sc_rxm != NULL) { m_freem(sc->sc_rxm); sc->sc_rxm = NULL; } break; } if (m == NULL) return; mtx_unlock(&sc->sc_mtx); m->m_pkthdr.len = m->m_len = actlen; err = pkt = 0; /* HW can aggregate multiple frames in a single USB xfer */ + NET_EPOCH_ENTER(et); for (;;) { rxd = mtod(m, struct usie_desc *); len = be16toh(rxd->hip.len) & USIE_HIP_IP_LEN_MASK; pad = (rxd->hip.id & USIE_HIP_PAD) ? 1 : 0; ipl = (len - pad - ETHER_HDR_LEN); if (ipl >= len) { DPRINTF("Corrupt frame\n"); m_freem(m); break; } diff = sizeof(struct usie_desc) + ipl + pad; if (((rxd->hip.id & USIE_HIP_MASK) != USIE_HIP_IP) || (be16toh(rxd->desc_type) & USIE_TYPE_MASK) != USIE_IP_RX) { DPRINTF("received wrong type of packet\n"); m->m_data += diff; m->m_pkthdr.len = (m->m_len -= diff); err++; if (m->m_pkthdr.len > 0) continue; m_freem(m); break; } switch (be16toh(rxd->ethhdr.ether_type)) { case ETHERTYPE_IP: ipv = NETISR_IP; break; #ifdef INET6 case ETHERTYPE_IPV6: ipv = NETISR_IPV6; break; #endif default: DPRINTF("unsupported ether type\n"); err++; break; } /* the last packet */ if (m->m_pkthdr.len <= diff) { m->m_data += (sizeof(struct usie_desc) + pad); m->m_pkthdr.len = m->m_len = ipl; m->m_pkthdr.rcvif = ifp; BPF_MTAP(sc->sc_ifp, m); netisr_dispatch(ipv, m); break; } /* copy aggregated frames to another mbuf */ m0 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(m0 == NULL)) { DPRINTF("could not allocate mbuf\n"); err++; m_freem(m); break; } m_copydata(m, sizeof(struct usie_desc) + pad, ipl, mtod(m0, caddr_t)); m0->m_pkthdr.rcvif = ifp; m0->m_pkthdr.len = m0->m_len = ipl; BPF_MTAP(sc->sc_ifp, m0); netisr_dispatch(ipv, m0); m->m_data += diff; m->m_pkthdr.len = (m->m_len -= diff); } + NET_EPOCH_EXIT(et); mtx_lock(&sc->sc_mtx); if_inc_counter(ifp, IFCOUNTER_IERRORS, err); if_inc_counter(ifp, IFCOUNTER_IPACKETS, pkt); } static void usie_if_tx_callback(struct usb_xfer *xfer, usb_error_t error) { struct usie_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; struct ifnet *ifp = sc->sc_ifp; struct mbuf *m; uint16_t size; switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(11, "transfer complete\n"); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* fall though */ case USB_ST_SETUP: tr_setup: if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; if (m->m_pkthdr.len > (int)(MCLBYTES - ETHER_HDR_LEN + ETHER_CRC_LEN - sizeof(sc->sc_txd))) { DPRINTF("packet len is too big: %d\n", m->m_pkthdr.len); break; } pc = usbd_xfer_get_frame(xfer, 0); sc->sc_txd.hip.len = htobe16(m->m_pkthdr.len + ETHER_HDR_LEN + ETHER_CRC_LEN); size = sizeof(sc->sc_txd); usbd_copy_in(pc, 0, &sc->sc_txd, size); usbd_m_copy_in(pc, size, m, 0, m->m_pkthdr.len); usbd_xfer_set_frame_len(xfer, 0, m->m_pkthdr.len + size + ETHER_CRC_LEN); BPF_MTAP(ifp, m); m_freem(m); usbd_transfer_submit(xfer); break; default: /* Error */ DPRINTF("USB transfer error, %s\n", usbd_errstr(error)); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (error != USB_ERR_CANCELLED) { usbd_xfer_set_stall(xfer); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto tr_setup; } break; } } static void usie_if_status_callback(struct usb_xfer *xfer, usb_error_t error) { struct usie_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; struct usb_cdc_notification cdc; uint32_t actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(4, "info received, actlen=%d\n", actlen); /* usb_cdc_notification - .data[16] */ if (actlen < (sizeof(cdc) - 16)) { DPRINTF("data too short %d\n", actlen); goto tr_setup; } pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_out(pc, 0, &cdc, (sizeof(cdc) - 16)); DPRINTFN(4, "bNotification=%x\n", cdc.bNotification); if (cdc.bNotification & UCDC_N_RESPONSE_AVAILABLE) { taskqueue_enqueue(taskqueue_thread, &sc->sc_if_status_task); } /* fall though */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: /* Error */ DPRINTF("USB transfer error, %s\n", usbd_errstr(error)); if (error != USB_ERR_CANCELLED) { usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void usie_if_sync_to(void *arg) { struct usie_softc *sc = arg; taskqueue_enqueue(taskqueue_thread, &sc->sc_if_sync_task); } static void usie_if_sync_cb(void *arg, int pending) { struct usie_softc *sc = arg; mtx_lock(&sc->sc_mtx); /* call twice */ usie_if_cmd(sc, USIE_HIP_SYNC2M); usie_if_cmd(sc, USIE_HIP_SYNC2M); usb_callout_reset(&sc->sc_if_sync_ch, 2 * hz, usie_if_sync_to, sc); mtx_unlock(&sc->sc_mtx); } static void usie_if_status_cb(void *arg, int pending) { struct usie_softc *sc = arg; struct ifnet *ifp = sc->sc_ifp; struct usb_device_request req; struct usie_hip *hip; struct usie_lsi *lsi; uint16_t actlen; uint8_t ntries; uint8_t pad; mtx_lock(&sc->sc_mtx); req.bmRequestType = UT_READ_CLASS_INTERFACE; req.bRequest = UCDC_GET_ENCAPSULATED_RESPONSE; USETW(req.wValue, 0); USETW(req.wIndex, sc->sc_if_ifnum); USETW(req.wLength, sizeof(sc->sc_status_temp)); for (ntries = 0; ntries != 10; ntries++) { int err; err = usbd_do_request_flags(sc->sc_udev, &sc->sc_mtx, &req, sc->sc_status_temp, USB_SHORT_XFER_OK, &actlen, USB_DEFAULT_TIMEOUT); if (err == 0) break; DPRINTF("Control request failed: %s %d/10\n", usbd_errstr(err), ntries); usb_pause_mtx(&sc->sc_mtx, USB_MS_TO_TICKS(10)); } if (ntries == 10) { mtx_unlock(&sc->sc_mtx); DPRINTF("Timeout\n"); return; } hip = (struct usie_hip *)sc->sc_status_temp; pad = (hip->id & USIE_HIP_PAD) ? 1 : 0; DPRINTF("hip.id=%x hip.len=%d actlen=%u pad=%d\n", hip->id, be16toh(hip->len), actlen, pad); switch (hip->id & USIE_HIP_MASK) { case USIE_HIP_SYNC2H: usie_if_cmd(sc, USIE_HIP_SYNC2M); break; case USIE_HIP_RESTR: usb_callout_stop(&sc->sc_if_sync_ch); break; case USIE_HIP_UMTS: lsi = (struct usie_lsi *)( sc->sc_status_temp + sizeof(struct usie_hip) + pad); DPRINTF("lsi.proto=%x lsi.len=%d\n", lsi->proto, be16toh(lsi->len)); if (lsi->proto != USIE_LSI_UMTS) break; if (lsi->area == USIE_LSI_AREA_NO || lsi->area == USIE_LSI_AREA_NODATA) { device_printf(sc->sc_dev, "no service available\n"); break; } if (lsi->state == USIE_LSI_STATE_IDLE) { DPRINTF("lsi.state=%x\n", lsi->state); break; } DPRINTF("ctx=%x\n", hip->param); sc->sc_txd.hip.param = hip->param; sc->sc_net.addr_len = lsi->pdp_addr_len; memcpy(&sc->sc_net.dns1_addr, &lsi->dns1_addr, 16); memcpy(&sc->sc_net.dns2_addr, &lsi->dns2_addr, 16); memcpy(sc->sc_net.pdp_addr, lsi->pdp_addr, 16); memcpy(sc->sc_net.gw_addr, lsi->gw_addr, 16); ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; device_printf(sc->sc_dev, "IP Addr=%d.%d.%d.%d\n", *lsi->pdp_addr, *(lsi->pdp_addr + 1), *(lsi->pdp_addr + 2), *(lsi->pdp_addr + 3)); device_printf(sc->sc_dev, "Gateway Addr=%d.%d.%d.%d\n", *lsi->gw_addr, *(lsi->gw_addr + 1), *(lsi->gw_addr + 2), *(lsi->gw_addr + 3)); device_printf(sc->sc_dev, "Prim NS Addr=%d.%d.%d.%d\n", *lsi->dns1_addr, *(lsi->dns1_addr + 1), *(lsi->dns1_addr + 2), *(lsi->dns1_addr + 3)); device_printf(sc->sc_dev, "Scnd NS Addr=%d.%d.%d.%d\n", *lsi->dns2_addr, *(lsi->dns2_addr + 1), *(lsi->dns2_addr + 2), *(lsi->dns2_addr + 3)); usie_cns_req(sc, USIE_CNS_ID_RSSI, USIE_CNS_OB_RSSI); break; case USIE_HIP_RCGI: /* ignore, workaround for sloppy windows */ break; default: DPRINTF("undefined msgid: %x\n", hip->id); break; } mtx_unlock(&sc->sc_mtx); } static void usie_if_start(struct ifnet *ifp) { struct usie_softc *sc = ifp->if_softc; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { DPRINTF("Not running\n"); return; } mtx_lock(&sc->sc_mtx); usbd_transfer_start(sc->sc_if_xfer[USIE_IF_TX]); mtx_unlock(&sc->sc_mtx); DPRINTFN(3, "interface started\n"); } static int usie_if_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int err; DPRINTF("proto=%x\n", dst->sa_family); switch (dst->sa_family) { #ifdef INET6 case AF_INET6; /* fall though */ #endif case AF_INET: break; /* silently drop dhclient packets */ case AF_UNSPEC: m_freem(m); return (0); /* drop other packet types */ default: m_freem(m); return (EAFNOSUPPORT); } err = (ifp->if_transmit)(ifp, m); if (err) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } static void usie_if_init(void *arg) { struct usie_softc *sc = arg; struct ifnet *ifp = sc->sc_ifp; uint8_t i; mtx_lock(&sc->sc_mtx); /* write tx descriptor */ sc->sc_txd.hip.id = USIE_HIP_CTX; sc->sc_txd.hip.param = 0; /* init value */ sc->sc_txd.desc_type = htobe16(USIE_IP_TX); for (i = 0; i != USIE_IF_N_XFER; i++) usbd_xfer_set_stall(sc->sc_if_xfer[i]); usbd_transfer_start(sc->sc_uc_xfer[USIE_HIP_IF][USIE_UC_RX]); usbd_transfer_start(sc->sc_if_xfer[USIE_IF_STATUS]); usbd_transfer_start(sc->sc_if_xfer[USIE_IF_RX]); /* if not running, initiate the modem */ if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) usie_cns_req(sc, USIE_CNS_ID_INIT, USIE_CNS_OB_LINK_UPDATE); mtx_unlock(&sc->sc_mtx); DPRINTF("ifnet initialized\n"); } static void usie_if_stop(struct usie_softc *sc) { usb_callout_drain(&sc->sc_if_sync_ch); mtx_lock(&sc->sc_mtx); /* usie_cns_req() clears IFF_* flags */ usie_cns_req(sc, USIE_CNS_ID_STOP, USIE_CNS_OB_LINK_UPDATE); usbd_transfer_stop(sc->sc_if_xfer[USIE_IF_TX]); usbd_transfer_stop(sc->sc_if_xfer[USIE_IF_RX]); usbd_transfer_stop(sc->sc_if_xfer[USIE_IF_STATUS]); /* shutdown device */ usie_if_cmd(sc, USIE_HIP_DOWN); mtx_unlock(&sc->sc_mtx); } static int usie_if_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct usie_softc *sc = ifp->if_softc; struct ieee80211req *ireq; struct ieee80211req_sta_info si; struct ifmediareq *ifmr; switch (cmd) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) usie_if_init(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) usie_if_stop(sc); } break; case SIOCSIFCAP: if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { device_printf(sc->sc_dev, "Connect to the network first.\n"); break; } mtx_lock(&sc->sc_mtx); usie_cns_req(sc, USIE_CNS_ID_RSSI, USIE_CNS_OB_RSSI); mtx_unlock(&sc->sc_mtx); break; case SIOCG80211: ireq = (struct ieee80211req *)data; if (ireq->i_type != IEEE80211_IOC_STA_INFO) break; memset(&si, 0, sizeof(si)); si.isi_len = sizeof(si); /* * ifconfig expects RSSI in 0.5dBm units * relative to the noise floor. */ si.isi_rssi = 2 * sc->sc_rssi; if (copyout(&si, (uint8_t *)ireq->i_data + 8, sizeof(struct ieee80211req_sta_info))) DPRINTF("copyout failed\n"); DPRINTF("80211\n"); break; case SIOCGIFMEDIA: /* to fool ifconfig */ ifmr = (struct ifmediareq *)data; ifmr->ifm_count = 1; DPRINTF("media\n"); break; case SIOCSIFADDR: break; default: return (EINVAL); } return (0); } static int usie_do_request(struct usie_softc *sc, struct usb_device_request *req, void *data) { int err = 0; int ntries; mtx_assert(&sc->sc_mtx, MA_OWNED); for (ntries = 0; ntries != 10; ntries++) { err = usbd_do_request(sc->sc_udev, &sc->sc_mtx, req, data); if (err == 0) break; DPRINTF("Control request failed: %s %d/10\n", usbd_errstr(err), ntries); usb_pause_mtx(&sc->sc_mtx, USB_MS_TO_TICKS(10)); } return (err); } static int usie_if_cmd(struct usie_softc *sc, uint8_t cmd) { struct usb_device_request req; struct usie_hip msg; msg.len = 0; msg.id = cmd; msg.param = 0; req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UCDC_SEND_ENCAPSULATED_COMMAND; USETW(req.wValue, 0); USETW(req.wIndex, sc->sc_if_ifnum); USETW(req.wLength, sizeof(msg)); DPRINTF("cmd=%x\n", cmd); return (usie_do_request(sc, &req, &msg)); } static void usie_cns_req(struct usie_softc *sc, uint32_t id, uint16_t obj) { struct ifnet *ifp = sc->sc_ifp; struct mbuf *m; struct usb_xfer *xfer; struct usie_hip *hip; struct usie_cns *cns; uint8_t *param; uint8_t *tmp; uint8_t cns_len; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { DPRINTF("could not allocate mbuf\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } /* to align usie_hip{} on 32 bit */ m->m_data += 3; param = mtod(m, uint8_t *); *param++ = USIE_HIP_FRM_CHR; hip = (struct usie_hip *)param; cns = (struct usie_cns *)(hip + 1); tmp = param + USIE_HIPCNS_MIN - 2; switch (obj) { case USIE_CNS_OB_LINK_UPDATE: cns_len = 2; cns->op = USIE_CNS_OP_SET; *tmp++ = 1; /* profile ID, always use 1 for now */ *tmp++ = id == USIE_CNS_ID_INIT ? 1 : 0; break; case USIE_CNS_OB_PROF_WRITE: cns_len = 245; cns->op = USIE_CNS_OP_SET; *tmp++ = 1; /* profile ID, always use 1 for now */ *tmp++ = 2; memcpy(tmp, &sc->sc_net, 34); memset(tmp + 35, 0, 245 - 36); tmp += 243; break; case USIE_CNS_OB_RSSI: cns_len = 0; cns->op = USIE_CNS_OP_REQ; break; default: DPRINTF("unsupported CnS object type\n"); return; } *tmp = USIE_HIP_FRM_CHR; hip->len = htobe16(sizeof(struct usie_cns) + cns_len); hip->id = USIE_HIP_CNS2M; hip->param = 0; /* none for CnS */ cns->obj = htobe16(obj); cns->id = htobe32(id); cns->len = cns_len; cns->rsv0 = cns->rsv1 = 0; /* always '0' */ param = (uint8_t *)(cns + 1); DPRINTF("param: %16D\n", param, ":"); m->m_pkthdr.len = m->m_len = USIE_HIPCNS_MIN + cns_len + 2; xfer = sc->sc_uc_xfer[USIE_HIP_IF][USIE_UC_TX]; if (usbd_xfer_get_priv(xfer) == NULL) { usbd_xfer_set_priv(xfer, m); usbd_transfer_start(xfer); } else { DPRINTF("Dropped CNS event\n"); m_freem(m); } } static void usie_cns_rsp(struct usie_softc *sc, struct usie_cns *cns) { struct ifnet *ifp = sc->sc_ifp; DPRINTF("received CnS\n"); switch (be16toh(cns->obj)) { case USIE_CNS_OB_LINK_UPDATE: if (be32toh(cns->id) & USIE_CNS_ID_INIT) usie_if_sync_to(sc); else if (be32toh(cns->id) & USIE_CNS_ID_STOP) { ifp->if_flags &= ~IFF_UP; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } else DPRINTF("undefined link update\n"); break; case USIE_CNS_OB_RSSI: sc->sc_rssi = be16toh(*(int16_t *)(cns + 1)); if (sc->sc_rssi <= 0) device_printf(sc->sc_dev, "No signal\n"); else { device_printf(sc->sc_dev, "RSSI=%ddBm\n", sc->sc_rssi - 110); } break; case USIE_CNS_OB_PROF_WRITE: break; case USIE_CNS_OB_PDP_READ: break; default: DPRINTF("undefined CnS\n"); break; } } static void usie_hip_rsp(struct usie_softc *sc, uint8_t *rsp, uint32_t len) { struct usie_hip *hip; struct usie_cns *cns; uint32_t i; uint32_t j; uint32_t off; uint8_t tmp[USIE_HIPCNS_MAX] __aligned(4); for (off = 0; (off + USIE_HIPCNS_MIN) <= len; off++) { uint8_t pad; while ((off < len) && (rsp[off] == USIE_HIP_FRM_CHR)) off++; /* Unstuff the bytes */ for (i = j = 0; ((i + off) < len) && (j < USIE_HIPCNS_MAX); i++) { if (rsp[i + off] == USIE_HIP_FRM_CHR) break; if (rsp[i + off] == USIE_HIP_ESC_CHR) { if ((i + off + 1) >= len) break; tmp[j++] = rsp[i++ + off + 1] ^ 0x20; } else { tmp[j++] = rsp[i + off]; } } off += i; DPRINTF("frame len=%d\n", j); if (j < sizeof(struct usie_hip)) { DPRINTF("too little data\n"); break; } /* * Make sure we are not reading the stack if something * is wrong. */ memset(tmp + j, 0, sizeof(tmp) - j); hip = (struct usie_hip *)tmp; DPRINTF("hip: len=%d msgID=%02x, param=%02x\n", be16toh(hip->len), hip->id, hip->param); pad = (hip->id & USIE_HIP_PAD) ? 1 : 0; if ((hip->id & USIE_HIP_MASK) == USIE_HIP_CNS2H) { cns = (struct usie_cns *)(((uint8_t *)(hip + 1)) + pad); if (j < (sizeof(struct usie_cns) + sizeof(struct usie_hip) + pad)) { DPRINTF("too little data\n"); break; } DPRINTF("cns: obj=%04x, op=%02x, rsv0=%02x, " "app=%08x, rsv1=%02x, len=%d\n", be16toh(cns->obj), cns->op, cns->rsv0, be32toh(cns->id), cns->rsv1, cns->len); if (cns->op & USIE_CNS_OP_ERR) DPRINTF("CnS error response\n"); else usie_cns_rsp(sc, cns); i = sizeof(struct usie_hip) + pad + sizeof(struct usie_cns); j = cns->len; } else { i = sizeof(struct usie_hip) + pad; j = be16toh(hip->len); } #ifdef USB_DEBUG if (usie_debug == 0) continue; while (i < USIE_HIPCNS_MAX && j > 0) { DPRINTF("param[0x%02x] = 0x%02x\n", i, tmp[i]); i++; j--; } #endif } } static int usie_driver_loaded(struct module *mod, int what, void *arg) { switch (what) { case MOD_LOAD: /* register autoinstall handler */ usie_etag = EVENTHANDLER_REGISTER(usb_dev_configured, usie_autoinst, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(usb_dev_configured, usie_etag); break; default: return (EOPNOTSUPP); } return (0); } diff --git a/sys/dev/usb/net/uhso.c b/sys/dev/usb/net/uhso.c index 8e826fd9e6a7..127379fe22b0 100644 --- a/sys/dev/usb/net/uhso.c +++ b/sys/dev/usb/net/uhso.c @@ -1,1934 +1,1937 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Fredrik Lindberg * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "usbdevs.h" #define USB_DEBUG_VAR uhso_debug #include #include #include #include #include struct uhso_tty { struct uhso_softc *ht_sc; struct usb_xfer *ht_xfer[3]; int ht_muxport; /* Mux. port no */ int ht_open; char ht_name[32]; }; struct uhso_softc { device_t sc_dev; struct usb_device *sc_udev; struct mtx sc_mtx; uint32_t sc_type; /* Interface definition */ int sc_radio; struct usb_xfer *sc_xfer[3]; uint8_t sc_iface_no; uint8_t sc_iface_index; /* Control pipe */ struct usb_xfer * sc_ctrl_xfer[2]; uint8_t sc_ctrl_iface_no; /* Network */ struct usb_xfer *sc_if_xfer[2]; struct ifnet *sc_ifp; struct mbuf *sc_mwait; /* Partial packet */ size_t sc_waitlen; /* No. of outstanding bytes */ struct mbufq sc_rxq; struct callout sc_c; /* TTY related structures */ struct ucom_super_softc sc_super_ucom; int sc_ttys; struct uhso_tty *sc_tty; struct ucom_softc *sc_ucom; int sc_msr; int sc_lsr; int sc_line; }; #define UHSO_MAX_MTU 2048 /* * There are mainly two type of cards floating around. * The first one has 2,3 or 4 interfaces with a multiplexed serial port * and packet interface on the first interface and bulk serial ports * on the others. * The second type of card has several other interfaces, their purpose * can be detected during run-time. */ #define UHSO_IFACE_SPEC(usb_type, port, port_type) \ (((usb_type) << 24) | ((port) << 16) | (port_type)) #define UHSO_IFACE_USB_TYPE(x) ((x >> 24) & 0xff) #define UHSO_IFACE_PORT(x) ((x >> 16) & 0xff) #define UHSO_IFACE_PORT_TYPE(x) (x & 0xff) /* * USB interface types */ #define UHSO_IF_NET 0x01 /* Network packet interface */ #define UHSO_IF_MUX 0x02 /* Multiplexed serial port */ #define UHSO_IF_BULK 0x04 /* Bulk interface */ /* * Port types */ #define UHSO_PORT_UNKNOWN 0x00 #define UHSO_PORT_SERIAL 0x01 /* Serial port */ #define UHSO_PORT_NETWORK 0x02 /* Network packet interface */ /* * Multiplexed serial port destination sub-port names */ #define UHSO_MPORT_TYPE_CTL 0x00 /* Control port */ #define UHSO_MPORT_TYPE_APP 0x01 /* Application */ #define UHSO_MPORT_TYPE_PCSC 0x02 #define UHSO_MPORT_TYPE_GPS 0x03 #define UHSO_MPORT_TYPE_APP2 0x04 /* Secondary application */ #define UHSO_MPORT_TYPE_MAX UHSO_MPORT_TYPE_APP2 #define UHSO_MPORT_TYPE_NOMAX 8 /* Max number of mux ports */ /* * Port definitions * Note that these definitions are arbitrary and do not match the values * returned by the auto config descriptor. */ #define UHSO_PORT_TYPE_UNKNOWN 0x00 #define UHSO_PORT_TYPE_CTL 0x01 #define UHSO_PORT_TYPE_APP 0x02 #define UHSO_PORT_TYPE_APP2 0x03 #define UHSO_PORT_TYPE_MODEM 0x04 #define UHSO_PORT_TYPE_NETWORK 0x05 #define UHSO_PORT_TYPE_DIAG 0x06 #define UHSO_PORT_TYPE_DIAG2 0x07 #define UHSO_PORT_TYPE_GPS 0x08 #define UHSO_PORT_TYPE_GPSCTL 0x09 #define UHSO_PORT_TYPE_PCSC 0x0a #define UHSO_PORT_TYPE_MSD 0x0b #define UHSO_PORT_TYPE_VOICE 0x0c #define UHSO_PORT_TYPE_MAX 0x0c static eventhandler_tag uhso_etag; /* Overall port type */ static char *uhso_port[] = { "Unknown", "Serial", "Network", "Network/Serial" }; /* * Map between interface port type read from device and description type. * The position in this array is a direct map to the auto config * descriptor values. */ static unsigned char uhso_port_map[] = { UHSO_PORT_TYPE_UNKNOWN, UHSO_PORT_TYPE_DIAG, UHSO_PORT_TYPE_GPS, UHSO_PORT_TYPE_GPSCTL, UHSO_PORT_TYPE_APP, UHSO_PORT_TYPE_APP2, UHSO_PORT_TYPE_CTL, UHSO_PORT_TYPE_NETWORK, UHSO_PORT_TYPE_MODEM, UHSO_PORT_TYPE_MSD, UHSO_PORT_TYPE_PCSC, UHSO_PORT_TYPE_VOICE }; static char uhso_port_map_max = sizeof(uhso_port_map) / sizeof(char); static unsigned char uhso_mux_port_map[] = { UHSO_PORT_TYPE_CTL, UHSO_PORT_TYPE_APP, UHSO_PORT_TYPE_PCSC, UHSO_PORT_TYPE_GPS, UHSO_PORT_TYPE_APP2 }; static char *uhso_port_type[] = { "Unknown", /* Not a valid port */ "Control", "Application", "Application (Secondary)", "Modem", "Network", "Diagnostic", "Diagnostic (Secondary)", "GPS", "GPS Control", "PC Smartcard", "MSD", "Voice", }; static char *uhso_port_type_sysctl[] = { "unknown", "control", "application", "application", "modem", "network", "diagnostic", "diagnostic", "gps", "gps_control", "pcsc", "msd", "voice", }; #define UHSO_STATIC_IFACE 0x01 #define UHSO_AUTO_IFACE 0x02 /* ifnet device unit allocations */ static struct unrhdr *uhso_ifnet_unit = NULL; static const STRUCT_USB_HOST_ID uhso_devs[] = { #define UHSO_DEV(v,p,i) { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, i) } /* Option GlobeTrotter MAX 7.2 with upgraded firmware */ UHSO_DEV(OPTION, GTMAX72, UHSO_STATIC_IFACE), /* Option GlobeSurfer iCON 7.2 */ UHSO_DEV(OPTION, GSICON72, UHSO_STATIC_IFACE), /* Option iCON 225 */ UHSO_DEV(OPTION, GTHSDPA, UHSO_STATIC_IFACE), /* Option GlobeSurfer iCON HSUPA */ UHSO_DEV(OPTION, GSICONHSUPA, UHSO_STATIC_IFACE), /* Option GlobeTrotter HSUPA */ UHSO_DEV(OPTION, GTHSUPA, UHSO_STATIC_IFACE), /* GE40x */ UHSO_DEV(OPTION, GE40X, UHSO_AUTO_IFACE), UHSO_DEV(OPTION, GE40X_1, UHSO_AUTO_IFACE), UHSO_DEV(OPTION, GE40X_2, UHSO_AUTO_IFACE), UHSO_DEV(OPTION, GE40X_3, UHSO_AUTO_IFACE), /* Option GlobeSurfer iCON 401 */ UHSO_DEV(OPTION, ICON401, UHSO_AUTO_IFACE), /* Option GlobeTrotter Module 382 */ UHSO_DEV(OPTION, GMT382, UHSO_AUTO_IFACE), /* Option GTM661W */ UHSO_DEV(OPTION, GTM661W, UHSO_AUTO_IFACE), /* Option iCON EDGE */ UHSO_DEV(OPTION, ICONEDGE, UHSO_STATIC_IFACE), /* Option Module HSxPA */ UHSO_DEV(OPTION, MODHSXPA, UHSO_STATIC_IFACE), /* Option iCON 321 */ UHSO_DEV(OPTION, ICON321, UHSO_STATIC_IFACE), /* Option iCON 322 */ UHSO_DEV(OPTION, GTICON322, UHSO_STATIC_IFACE), /* Option iCON 505 */ UHSO_DEV(OPTION, ICON505, UHSO_AUTO_IFACE), /* Option iCON 452 */ UHSO_DEV(OPTION, ICON505, UHSO_AUTO_IFACE), #undef UHSO_DEV }; static SYSCTL_NODE(_hw_usb, OID_AUTO, uhso, CTLFLAG_RW, 0, "USB uhso"); static int uhso_autoswitch = 1; SYSCTL_INT(_hw_usb_uhso, OID_AUTO, auto_switch, CTLFLAG_RWTUN, &uhso_autoswitch, 0, "Automatically switch to modem mode"); #ifdef USB_DEBUG #ifdef UHSO_DEBUG static int uhso_debug = UHSO_DEBUG; #else static int uhso_debug = -1; #endif SYSCTL_INT(_hw_usb_uhso, OID_AUTO, debug, CTLFLAG_RWTUN, &uhso_debug, 0, "Debug level"); #define UHSO_DPRINTF(n, x, ...) {\ if (uhso_debug >= n) {\ printf("%s: " x, __func__, ##__VA_ARGS__);\ }\ } #else #define UHSO_DPRINTF(n, x, ...) #endif #ifdef UHSO_DEBUG_HEXDUMP # define UHSO_HEXDUMP(_buf, _len) do { \ { \ size_t __tmp; \ const char *__buf = (const char *)_buf; \ for (__tmp = 0; __tmp < _len; __tmp++) \ printf("%02hhx ", *__buf++); \ printf("\n"); \ } \ } while(0) #else # define UHSO_HEXDUMP(_buf, _len) #endif enum { UHSO_MUX_ENDPT_INTR = 0, UHSO_MUX_ENDPT_MAX }; enum { UHSO_CTRL_READ = 0, UHSO_CTRL_WRITE, UHSO_CTRL_MAX }; enum { UHSO_IFNET_READ = 0, UHSO_IFNET_WRITE, UHSO_IFNET_MAX }; enum { UHSO_BULK_ENDPT_READ = 0, UHSO_BULK_ENDPT_WRITE, UHSO_BULK_ENDPT_INTR, UHSO_BULK_ENDPT_MAX }; static usb_callback_t uhso_mux_intr_callback; static usb_callback_t uhso_mux_read_callback; static usb_callback_t uhso_mux_write_callback; static usb_callback_t uhso_bs_read_callback; static usb_callback_t uhso_bs_write_callback; static usb_callback_t uhso_bs_intr_callback; static usb_callback_t uhso_ifnet_read_callback; static usb_callback_t uhso_ifnet_write_callback; /* Config used for the default control pipes */ static const struct usb_config uhso_ctrl_config[UHSO_CTRL_MAX] = { [UHSO_CTRL_READ] = { .type = UE_CONTROL, .endpoint = 0x00, .direction = UE_DIR_ANY, .flags = { .pipe_bof = 1, .short_xfer_ok = 1 }, .bufsize = sizeof(struct usb_device_request) + 1024, .callback = &uhso_mux_read_callback }, [UHSO_CTRL_WRITE] = { .type = UE_CONTROL, .endpoint = 0x00, .direction = UE_DIR_ANY, .flags = { .pipe_bof = 1, .force_short_xfer = 1 }, .bufsize = sizeof(struct usb_device_request) + 1024, .timeout = 1000, .callback = &uhso_mux_write_callback } }; /* Config for the multiplexed serial ports */ static const struct usb_config uhso_mux_config[UHSO_MUX_ENDPT_MAX] = { [UHSO_MUX_ENDPT_INTR] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .flags = { .short_xfer_ok = 1 }, .bufsize = 0, .callback = &uhso_mux_intr_callback, } }; /* Config for the raw IP-packet interface */ static const struct usb_config uhso_ifnet_config[UHSO_IFNET_MAX] = { [UHSO_IFNET_READ] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .flags = { .pipe_bof = 1, .short_xfer_ok = 1 }, .bufsize = MCLBYTES, .callback = &uhso_ifnet_read_callback }, [UHSO_IFNET_WRITE] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .flags = { .pipe_bof = 1, .force_short_xfer = 1 }, .bufsize = MCLBYTES, .timeout = 5 * USB_MS_HZ, .callback = &uhso_ifnet_write_callback } }; /* Config for interfaces with normal bulk serial ports */ static const struct usb_config uhso_bs_config[UHSO_BULK_ENDPT_MAX] = { [UHSO_BULK_ENDPT_READ] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .flags = { .pipe_bof = 1, .short_xfer_ok = 1 }, .bufsize = 4096, .callback = &uhso_bs_read_callback }, [UHSO_BULK_ENDPT_WRITE] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .flags = { .pipe_bof = 1, .force_short_xfer = 1 }, .bufsize = 8192, .callback = &uhso_bs_write_callback }, [UHSO_BULK_ENDPT_INTR] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .flags = { .short_xfer_ok = 1 }, .bufsize = 0, .callback = &uhso_bs_intr_callback, } }; static int uhso_probe_iface(struct uhso_softc *, int, int (*probe)(struct usb_device *, int)); static int uhso_probe_iface_auto(struct usb_device *, int); static int uhso_probe_iface_static(struct usb_device *, int); static int uhso_attach_muxserial(struct uhso_softc *, struct usb_interface *, int type); static int uhso_attach_bulkserial(struct uhso_softc *, struct usb_interface *, int type); static int uhso_attach_ifnet(struct uhso_softc *, struct usb_interface *, int type); static void uhso_test_autoinst(void *, struct usb_device *, struct usb_attach_arg *); static int uhso_driver_loaded(struct module *, int, void *); static int uhso_radio_sysctl(SYSCTL_HANDLER_ARGS); static int uhso_radio_ctrl(struct uhso_softc *, int); static void uhso_free(struct ucom_softc *); static void uhso_ucom_start_read(struct ucom_softc *); static void uhso_ucom_stop_read(struct ucom_softc *); static void uhso_ucom_start_write(struct ucom_softc *); static void uhso_ucom_stop_write(struct ucom_softc *); static void uhso_ucom_cfg_get_status(struct ucom_softc *, uint8_t *, uint8_t *); static void uhso_ucom_cfg_set_dtr(struct ucom_softc *, uint8_t); static void uhso_ucom_cfg_set_rts(struct ucom_softc *, uint8_t); static void uhso_if_init(void *); static void uhso_if_start(struct ifnet *); static void uhso_if_stop(struct uhso_softc *); static int uhso_if_ioctl(struct ifnet *, u_long, caddr_t); static int uhso_if_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void uhso_if_rxflush(void *); static device_probe_t uhso_probe; static device_attach_t uhso_attach; static device_detach_t uhso_detach; static void uhso_free_softc(struct uhso_softc *); static device_method_t uhso_methods[] = { DEVMETHOD(device_probe, uhso_probe), DEVMETHOD(device_attach, uhso_attach), DEVMETHOD(device_detach, uhso_detach), { 0, 0 } }; static driver_t uhso_driver = { .name = "uhso", .methods = uhso_methods, .size = sizeof(struct uhso_softc) }; static devclass_t uhso_devclass; DRIVER_MODULE(uhso, uhub, uhso_driver, uhso_devclass, uhso_driver_loaded, 0); MODULE_DEPEND(uhso, ucom, 1, 1, 1); MODULE_DEPEND(uhso, usb, 1, 1, 1); MODULE_VERSION(uhso, 1); USB_PNP_HOST_INFO(uhso_devs); static struct ucom_callback uhso_ucom_callback = { .ucom_cfg_get_status = &uhso_ucom_cfg_get_status, .ucom_cfg_set_dtr = &uhso_ucom_cfg_set_dtr, .ucom_cfg_set_rts = &uhso_ucom_cfg_set_rts, .ucom_start_read = uhso_ucom_start_read, .ucom_stop_read = uhso_ucom_stop_read, .ucom_start_write = uhso_ucom_start_write, .ucom_stop_write = uhso_ucom_stop_write, .ucom_free = &uhso_free, }; static int uhso_probe(device_t self) { struct usb_attach_arg *uaa = device_get_ivars(self); int error; if (uaa->usb_mode != USB_MODE_HOST) return (ENXIO); if (uaa->info.bConfigIndex != 0) return (ENXIO); if (uaa->info.bDeviceClass != 0xff) return (ENXIO); error = usbd_lookup_id_by_uaa(uhso_devs, sizeof(uhso_devs), uaa); if (error != 0) return (error); /* * Probe device to see if we are able to attach * to this interface or not. */ if (USB_GET_DRIVER_INFO(uaa) == UHSO_AUTO_IFACE) { if (uhso_probe_iface_auto(uaa->device, uaa->info.bIfaceNum) == 0) return (ENXIO); } return (error); } static int uhso_attach(device_t self) { struct uhso_softc *sc = device_get_softc(self); struct usb_attach_arg *uaa = device_get_ivars(self); struct usb_interface_descriptor *id; struct sysctl_ctx_list *sctx; struct sysctl_oid *soid; struct sysctl_oid *tree = NULL, *tty_node; struct ucom_softc *ucom; struct uhso_tty *ht; int i, error, port; void *probe_f; usb_error_t uerr; char *desc; sc->sc_dev = self; sc->sc_udev = uaa->device; mtx_init(&sc->sc_mtx, "uhso", NULL, MTX_DEF); mbufq_init(&sc->sc_rxq, INT_MAX); /* XXXGL: sane maximum */ ucom_ref(&sc->sc_super_ucom); sc->sc_radio = 1; id = usbd_get_interface_descriptor(uaa->iface); sc->sc_ctrl_iface_no = id->bInterfaceNumber; sc->sc_iface_no = uaa->info.bIfaceNum; sc->sc_iface_index = uaa->info.bIfaceIndex; /* Setup control pipe */ uerr = usbd_transfer_setup(uaa->device, &sc->sc_iface_index, sc->sc_ctrl_xfer, uhso_ctrl_config, UHSO_CTRL_MAX, sc, &sc->sc_mtx); if (uerr) { device_printf(self, "Failed to setup control pipe: %s\n", usbd_errstr(uerr)); goto out; } if (USB_GET_DRIVER_INFO(uaa) == UHSO_STATIC_IFACE) probe_f = uhso_probe_iface_static; else if (USB_GET_DRIVER_INFO(uaa) == UHSO_AUTO_IFACE) probe_f = uhso_probe_iface_auto; else goto out; error = uhso_probe_iface(sc, uaa->info.bIfaceNum, probe_f); if (error != 0) goto out; sctx = device_get_sysctl_ctx(sc->sc_dev); soid = device_get_sysctl_tree(sc->sc_dev); SYSCTL_ADD_STRING(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "type", CTLFLAG_RD, uhso_port[UHSO_IFACE_PORT(sc->sc_type)], 0, "Port available at this interface"); SYSCTL_ADD_PROC(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "radio", CTLTYPE_INT | CTLFLAG_RWTUN, sc, 0, uhso_radio_sysctl, "I", "Enable radio"); /* * The default interface description on most Option devices isn't * very helpful. So we skip device_set_usb_desc and set the * device description manually. */ device_set_desc_copy(self, uhso_port_type[UHSO_IFACE_PORT_TYPE(sc->sc_type)]); /* Announce device */ device_printf(self, "<%s port> at <%s %s> on %s\n", uhso_port_type[UHSO_IFACE_PORT_TYPE(sc->sc_type)], usb_get_manufacturer(uaa->device), usb_get_product(uaa->device), device_get_nameunit(device_get_parent(self))); if (sc->sc_ttys > 0) { SYSCTL_ADD_INT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "ports", CTLFLAG_RD, &sc->sc_ttys, 0, "Number of attached serial ports"); tree = SYSCTL_ADD_NODE(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "port", CTLFLAG_RD, NULL, "Serial ports"); } /* * Loop through the number of found TTYs and create sysctl * nodes for them. */ for (i = 0; i < sc->sc_ttys; i++) { ht = &sc->sc_tty[i]; ucom = &sc->sc_ucom[i]; if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_MUX) port = uhso_mux_port_map[ht->ht_muxport]; else port = UHSO_IFACE_PORT_TYPE(sc->sc_type); desc = uhso_port_type_sysctl[port]; tty_node = SYSCTL_ADD_NODE(sctx, SYSCTL_CHILDREN(tree), OID_AUTO, desc, CTLFLAG_RD, NULL, ""); ht->ht_name[0] = 0; if (sc->sc_ttys == 1) snprintf(ht->ht_name, 32, "cuaU%d", ucom->sc_super->sc_unit); else { snprintf(ht->ht_name, 32, "cuaU%d.%d", ucom->sc_super->sc_unit, ucom->sc_subunit); } desc = uhso_port_type[port]; SYSCTL_ADD_STRING(sctx, SYSCTL_CHILDREN(tty_node), OID_AUTO, "tty", CTLFLAG_RD, ht->ht_name, 0, ""); SYSCTL_ADD_STRING(sctx, SYSCTL_CHILDREN(tty_node), OID_AUTO, "desc", CTLFLAG_RD, desc, 0, ""); if (bootverbose) device_printf(sc->sc_dev, "\"%s\" port at %s\n", desc, ht->ht_name); } return (0); out: uhso_detach(sc->sc_dev); return (ENXIO); } static int uhso_detach(device_t self) { struct uhso_softc *sc = device_get_softc(self); int i; usbd_transfer_unsetup(sc->sc_xfer, 3); usbd_transfer_unsetup(sc->sc_ctrl_xfer, UHSO_CTRL_MAX); if (sc->sc_ttys > 0) { ucom_detach(&sc->sc_super_ucom, sc->sc_ucom); for (i = 0; i < sc->sc_ttys; i++) { if (sc->sc_tty[i].ht_muxport != -1) { usbd_transfer_unsetup(sc->sc_tty[i].ht_xfer, UHSO_CTRL_MAX); } } } if (sc->sc_ifp != NULL) { callout_drain(&sc->sc_c); free_unr(uhso_ifnet_unit, sc->sc_ifp->if_dunit); mtx_lock(&sc->sc_mtx); uhso_if_stop(sc); bpfdetach(sc->sc_ifp); if_detach(sc->sc_ifp); if_free(sc->sc_ifp); mtx_unlock(&sc->sc_mtx); usbd_transfer_unsetup(sc->sc_if_xfer, UHSO_IFNET_MAX); } device_claim_softc(self); uhso_free_softc(sc); return (0); } UCOM_UNLOAD_DRAIN(uhso); static void uhso_free_softc(struct uhso_softc *sc) { if (ucom_unref(&sc->sc_super_ucom)) { free(sc->sc_tty, M_USBDEV); free(sc->sc_ucom, M_USBDEV); mtx_destroy(&sc->sc_mtx); device_free_softc(sc); } } static void uhso_free(struct ucom_softc *ucom) { uhso_free_softc(ucom->sc_parent); } static void uhso_test_autoinst(void *arg, struct usb_device *udev, struct usb_attach_arg *uaa) { struct usb_interface *iface; struct usb_interface_descriptor *id; if (uaa->dev_state != UAA_DEV_READY || !uhso_autoswitch) return; iface = usbd_get_iface(udev, 0); if (iface == NULL) return; id = iface->idesc; if (id == NULL || id->bInterfaceClass != UICLASS_MASS) return; if (usbd_lookup_id_by_uaa(uhso_devs, sizeof(uhso_devs), uaa)) return; /* no device match */ if (usb_msc_eject(udev, 0, MSC_EJECT_REZERO) == 0) { /* success, mark the udev as disappearing */ uaa->dev_state = UAA_DEV_EJECTING; } } static int uhso_driver_loaded(struct module *mod, int what, void *arg) { switch (what) { case MOD_LOAD: /* register our autoinstall handler */ uhso_etag = EVENTHANDLER_REGISTER(usb_dev_configured, uhso_test_autoinst, NULL, EVENTHANDLER_PRI_ANY); /* create our unit allocator for inet devs */ uhso_ifnet_unit = new_unrhdr(0, INT_MAX, NULL); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(usb_dev_configured, uhso_etag); delete_unrhdr(uhso_ifnet_unit); break; default: return (EOPNOTSUPP); } return (0); } /* * Probe the interface type by querying the device. The elements * of an array indicates the capabilities of a particular interface. * Returns a bit mask with the interface capabilities. */ static int uhso_probe_iface_auto(struct usb_device *udev, int index) { struct usb_device_request req; usb_error_t uerr; uint16_t actlen = 0; char port; char buf[17] = {0}; req.bmRequestType = UT_READ_VENDOR_DEVICE; req.bRequest = 0x86; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, 17); uerr = usbd_do_request_flags(udev, NULL, &req, buf, 0, &actlen, USB_MS_HZ); if (uerr != 0) { printf("%s: usbd_do_request_flags failed, %s\n", __func__, usbd_errstr(uerr)); return (0); } UHSO_DPRINTF(1, "actlen=%d\n", actlen); UHSO_HEXDUMP(buf, 17); if (index < 0 || index > 16) { UHSO_DPRINTF(0, "Index %d out of range\n", index); return (0); } UHSO_DPRINTF(1, "index=%d, type=%x[%s]\n", index, buf[index], uhso_port_type[(int)uhso_port_map[(int)buf[index]]]); if (buf[index] >= uhso_port_map_max) port = 0; else port = uhso_port_map[(int)buf[index]]; switch (port) { case UHSO_PORT_TYPE_NETWORK: return (UHSO_IFACE_SPEC(UHSO_IF_NET | UHSO_IF_MUX, UHSO_PORT_SERIAL | UHSO_PORT_NETWORK, port)); case UHSO_PORT_TYPE_DIAG: case UHSO_PORT_TYPE_DIAG2: case UHSO_PORT_TYPE_GPS: case UHSO_PORT_TYPE_GPSCTL: case UHSO_PORT_TYPE_CTL: case UHSO_PORT_TYPE_APP: case UHSO_PORT_TYPE_APP2: case UHSO_PORT_TYPE_MODEM: return (UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, port)); case UHSO_PORT_TYPE_MSD: return (0); case UHSO_PORT_TYPE_UNKNOWN: default: return (0); } return (0); } /* * Returns the capabilities of interfaces for devices that don't * support the automatic query. * Returns a bit mask with the interface capabilities. */ static int uhso_probe_iface_static(struct usb_device *udev, int index) { struct usb_config_descriptor *cd; cd = usbd_get_config_descriptor(udev); if (cd->bNumInterface <= 3) { /* Cards with 3 or less interfaces */ switch (index) { case 0: return UHSO_IFACE_SPEC(UHSO_IF_NET | UHSO_IF_MUX, UHSO_PORT_SERIAL | UHSO_PORT_NETWORK, UHSO_PORT_TYPE_NETWORK); case 1: return UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, UHSO_PORT_TYPE_DIAG); case 2: return UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, UHSO_PORT_TYPE_MODEM); } } else { /* Cards with 4 interfaces */ switch (index) { case 0: return UHSO_IFACE_SPEC(UHSO_IF_NET | UHSO_IF_MUX, UHSO_PORT_SERIAL | UHSO_PORT_NETWORK, UHSO_PORT_TYPE_NETWORK); case 1: return UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, UHSO_PORT_TYPE_DIAG2); case 2: return UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, UHSO_PORT_TYPE_MODEM); case 3: return UHSO_IFACE_SPEC(UHSO_IF_BULK, UHSO_PORT_SERIAL, UHSO_PORT_TYPE_DIAG); } } return (0); } /* * Probes an interface for its particular capabilities and attaches if * it's a supported interface. */ static int uhso_probe_iface(struct uhso_softc *sc, int index, int (*probe)(struct usb_device *, int)) { struct usb_interface *iface; int type, error; UHSO_DPRINTF(1, "Probing for interface %d, probe_func=%p\n", index, probe); type = probe(sc->sc_udev, index); UHSO_DPRINTF(1, "Probe result %x\n", type); if (type <= 0) return (ENXIO); sc->sc_type = type; iface = usbd_get_iface(sc->sc_udev, index); if (UHSO_IFACE_PORT_TYPE(type) == UHSO_PORT_TYPE_NETWORK) { error = uhso_attach_ifnet(sc, iface, type); if (error) { UHSO_DPRINTF(1, "uhso_attach_ifnet failed"); return (ENXIO); } /* * If there is an additional interrupt endpoint on this * interface then we most likely have a multiplexed serial port * available. */ if (iface->idesc->bNumEndpoints < 3) { sc->sc_type = UHSO_IFACE_SPEC( UHSO_IFACE_USB_TYPE(type) & ~UHSO_IF_MUX, UHSO_IFACE_PORT(type) & ~UHSO_PORT_SERIAL, UHSO_IFACE_PORT_TYPE(type)); return (0); } UHSO_DPRINTF(1, "Trying to attach mux. serial\n"); error = uhso_attach_muxserial(sc, iface, type); if (error == 0 && sc->sc_ttys > 0) { error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_ttys, sc, &uhso_ucom_callback, &sc->sc_mtx); if (error) { device_printf(sc->sc_dev, "ucom_attach failed\n"); return (ENXIO); } ucom_set_pnpinfo_usb(&sc->sc_super_ucom, sc->sc_dev); mtx_lock(&sc->sc_mtx); usbd_transfer_start(sc->sc_xfer[UHSO_MUX_ENDPT_INTR]); mtx_unlock(&sc->sc_mtx); } } else if ((UHSO_IFACE_USB_TYPE(type) & UHSO_IF_BULK) && UHSO_IFACE_PORT(type) & UHSO_PORT_SERIAL) { error = uhso_attach_bulkserial(sc, iface, type); if (error) return (ENXIO); error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_ttys, sc, &uhso_ucom_callback, &sc->sc_mtx); if (error) { device_printf(sc->sc_dev, "ucom_attach failed\n"); return (ENXIO); } ucom_set_pnpinfo_usb(&sc->sc_super_ucom, sc->sc_dev); } else { UHSO_DPRINTF(0, "Unknown type %x\n", type); return (ENXIO); } return (0); } static int uhso_radio_ctrl(struct uhso_softc *sc, int onoff) { struct usb_device_request req; usb_error_t uerr; req.bmRequestType = UT_VENDOR; req.bRequest = onoff ? 0x82 : 0x81; USETW(req.wValue, 0); USETW(req.wIndex, 0); USETW(req.wLength, 0); uerr = usbd_do_request(sc->sc_udev, NULL, &req, NULL); if (uerr != 0) { device_printf(sc->sc_dev, "usbd_do_request_flags failed: %s\n", usbd_errstr(uerr)); return (-1); } return (onoff); } static int uhso_radio_sysctl(SYSCTL_HANDLER_ARGS) { struct uhso_softc *sc = arg1; int error, radio; radio = sc->sc_radio; error = sysctl_handle_int(oidp, &radio, 0, req); if (error) return (error); if (radio != sc->sc_radio) { radio = radio != 0 ? 1 : 0; error = uhso_radio_ctrl(sc, radio); if (error != -1) sc->sc_radio = radio; } return (0); } /* * Expands allocated memory to fit an additional TTY. * Two arrays are kept with matching indexes, one for ucom and one * for our private data. */ static int uhso_alloc_tty(struct uhso_softc *sc) { sc->sc_ttys++; sc->sc_tty = reallocf(sc->sc_tty, sizeof(struct uhso_tty) * sc->sc_ttys, M_USBDEV, M_WAITOK | M_ZERO); if (sc->sc_tty == NULL) return (-1); sc->sc_ucom = reallocf(sc->sc_ucom, sizeof(struct ucom_softc) * sc->sc_ttys, M_USBDEV, M_WAITOK | M_ZERO); if (sc->sc_ucom == NULL) return (-1); sc->sc_tty[sc->sc_ttys - 1].ht_sc = sc; UHSO_DPRINTF(1, "Allocated TTY %d\n", sc->sc_ttys - 1); return (sc->sc_ttys - 1); } /* * Attach a multiplexed serial port * Data is read/written with requests on the default control pipe. An interrupt * endpoint returns when there is new data to be read. */ static int uhso_attach_muxserial(struct uhso_softc *sc, struct usb_interface *iface, int type) { struct usb_descriptor *desc; int i, port, tty; usb_error_t uerr; /* * The class specific interface (type 0x24) descriptor subtype field * contains a bitmask that specifies which (and how many) ports that * are available through this multiplexed serial port. */ desc = usbd_find_descriptor(sc->sc_udev, NULL, iface->idesc->bInterfaceNumber, UDESC_CS_INTERFACE, 0xff, 0, 0); if (desc == NULL) { UHSO_DPRINTF(0, "Failed to find UDESC_CS_INTERFACE\n"); return (ENXIO); } UHSO_DPRINTF(1, "Mux port mask %x\n", desc->bDescriptorSubtype); if (desc->bDescriptorSubtype == 0) return (ENXIO); /* * The bitmask is one octet, loop through the number of * bits that are set and create a TTY for each. */ for (i = 0; i < 8; i++) { port = (1 << i); if ((port & desc->bDescriptorSubtype) == port) { UHSO_DPRINTF(2, "Found mux port %x (%d)\n", port, i); tty = uhso_alloc_tty(sc); if (tty < 0) return (ENOMEM); sc->sc_tty[tty].ht_muxport = i; uerr = usbd_transfer_setup(sc->sc_udev, &sc->sc_iface_index, sc->sc_tty[tty].ht_xfer, uhso_ctrl_config, UHSO_CTRL_MAX, sc, &sc->sc_mtx); if (uerr) { device_printf(sc->sc_dev, "Failed to setup control pipe: %s\n", usbd_errstr(uerr)); return (ENXIO); } } } /* Setup the intr. endpoint */ uerr = usbd_transfer_setup(sc->sc_udev, &iface->idesc->bInterfaceNumber, sc->sc_xfer, uhso_mux_config, 1, sc, &sc->sc_mtx); if (uerr) return (ENXIO); return (0); } /* * Interrupt callback for the multiplexed serial port. Indicates * which serial port has data waiting. */ static void uhso_mux_intr_callback(struct usb_xfer *xfer, usb_error_t error) { struct usb_page_cache *pc; struct usb_page_search res; struct uhso_softc *sc = usbd_xfer_softc(xfer); unsigned int i, mux; UHSO_DPRINTF(3, "status %d\n", USB_GET_STATE(xfer)); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: /* * The multiplexed port number can be found at the first byte. * It contains a bit mask, we transform this in to an integer. */ pc = usbd_xfer_get_frame(xfer, 0); usbd_get_page(pc, 0, &res); i = *((unsigned char *)res.buffer); mux = 0; while (i >>= 1) { mux++; } UHSO_DPRINTF(3, "mux port %d (%d)\n", mux, i); if (mux > UHSO_MPORT_TYPE_NOMAX) break; /* Issue a read for this serial port */ usbd_xfer_set_priv( sc->sc_tty[mux].ht_xfer[UHSO_CTRL_READ], &sc->sc_tty[mux]); usbd_transfer_start(sc->sc_tty[mux].ht_xfer[UHSO_CTRL_READ]); break; case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static void uhso_mux_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; struct usb_device_request req; struct uhso_tty *ht; int actlen, len; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status %d\n", USB_GET_STATE(xfer)); ht = usbd_xfer_get_priv(xfer); UHSO_DPRINTF(3, "ht=%p open=%d\n", ht, ht->ht_open); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: /* Got data, send to ucom */ pc = usbd_xfer_get_frame(xfer, 1); len = usbd_xfer_frame_len(xfer, 1); UHSO_DPRINTF(3, "got %d bytes on mux port %d\n", len, ht->ht_muxport); if (len <= 0) { usbd_transfer_start(sc->sc_xfer[UHSO_MUX_ENDPT_INTR]); break; } /* Deliver data if the TTY is open, discard otherwise */ if (ht->ht_open) ucom_put_data(&sc->sc_ucom[ht->ht_muxport], pc, 0, len); /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: memset(&req, 0, sizeof(struct usb_device_request)); req.bmRequestType = UT_READ_CLASS_INTERFACE; req.bRequest = UCDC_GET_ENCAPSULATED_RESPONSE; USETW(req.wValue, 0); USETW(req.wIndex, ht->ht_muxport); USETW(req.wLength, 1024); pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_in(pc, 0, &req, sizeof(req)); usbd_xfer_set_frame_len(xfer, 0, sizeof(req)); usbd_xfer_set_frame_len(xfer, 1, 1024); usbd_xfer_set_frames(xfer, 2); usbd_transfer_submit(xfer); break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static void uhso_mux_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct uhso_tty *ht; struct usb_page_cache *pc; struct usb_device_request req; int actlen; struct usb_page_search res; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); ht = usbd_xfer_get_priv(xfer); UHSO_DPRINTF(3, "status=%d, using mux port %d\n", USB_GET_STATE(xfer), ht->ht_muxport); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: UHSO_DPRINTF(3, "wrote %zd data bytes to muxport %d\n", actlen - sizeof(struct usb_device_request) , ht->ht_muxport); /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: pc = usbd_xfer_get_frame(xfer, 1); if (ucom_get_data(&sc->sc_ucom[ht->ht_muxport], pc, 0, 32, &actlen)) { usbd_get_page(pc, 0, &res); memset(&req, 0, sizeof(struct usb_device_request)); req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UCDC_SEND_ENCAPSULATED_COMMAND; USETW(req.wValue, 0); USETW(req.wIndex, ht->ht_muxport); USETW(req.wLength, actlen); pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_in(pc, 0, &req, sizeof(req)); usbd_xfer_set_frame_len(xfer, 0, sizeof(req)); usbd_xfer_set_frame_len(xfer, 1, actlen); usbd_xfer_set_frames(xfer, 2); UHSO_DPRINTF(3, "Prepared %d bytes for transmit " "on muxport %d\n", actlen, ht->ht_muxport); usbd_transfer_submit(xfer); } break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static int uhso_attach_bulkserial(struct uhso_softc *sc, struct usb_interface *iface, int type) { usb_error_t uerr; int tty; /* Try attaching RD/WR/INTR first */ uerr = usbd_transfer_setup(sc->sc_udev, &iface->idesc->bInterfaceNumber, sc->sc_xfer, uhso_bs_config, UHSO_BULK_ENDPT_MAX, sc, &sc->sc_mtx); if (uerr) { /* Try only RD/WR */ uerr = usbd_transfer_setup(sc->sc_udev, &iface->idesc->bInterfaceNumber, sc->sc_xfer, uhso_bs_config, UHSO_BULK_ENDPT_MAX - 1, sc, &sc->sc_mtx); } if (uerr) { UHSO_DPRINTF(0, "usbd_transfer_setup failed"); return (-1); } tty = uhso_alloc_tty(sc); if (tty < 0) { usbd_transfer_unsetup(sc->sc_xfer, UHSO_BULK_ENDPT_MAX); return (ENOMEM); } sc->sc_tty[tty].ht_muxport = -1; return (0); } static void uhso_bs_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status %d, actlen=%d\n", USB_GET_STATE(xfer), actlen); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: pc = usbd_xfer_get_frame(xfer, 0); ucom_put_data(&sc->sc_ucom[0], pc, 0, actlen); /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static void uhso_bs_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status %d, actlen=%d\n", USB_GET_STATE(xfer), actlen); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: case USB_ST_SETUP: tr_setup: pc = usbd_xfer_get_frame(xfer, 0); if (ucom_get_data(&sc->sc_ucom[0], pc, 0, 8192, &actlen)) { usbd_xfer_set_frame_len(xfer, 0, actlen); usbd_transfer_submit(xfer); } break; break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static void uhso_bs_cfg(struct uhso_softc *sc) { struct usb_device_request req; usb_error_t uerr; if (!(UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK)) return; req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UCDC_SET_CONTROL_LINE_STATE; USETW(req.wValue, sc->sc_line); USETW(req.wIndex, sc->sc_iface_no); USETW(req.wLength, 0); uerr = ucom_cfg_do_request(sc->sc_udev, &sc->sc_ucom[0], &req, NULL, 0, 1000); if (uerr != 0) { device_printf(sc->sc_dev, "failed to set ctrl line state to " "0x%02x: %s\n", sc->sc_line, usbd_errstr(uerr)); } } static void uhso_bs_intr_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc; int actlen; struct usb_cdc_notification cdc; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status %d, actlen=%d\n", USB_GET_STATE(xfer), actlen); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: if (actlen < UCDC_NOTIFICATION_LENGTH) { UHSO_DPRINTF(0, "UCDC notification too short: %d\n", actlen); goto tr_setup; } else if (actlen > (int)sizeof(struct usb_cdc_notification)) { UHSO_DPRINTF(0, "UCDC notification too large: %d\n", actlen); actlen = sizeof(struct usb_cdc_notification); } pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_out(pc, 0, &cdc, actlen); if (UGETW(cdc.wIndex) != sc->sc_iface_no) { UHSO_DPRINTF(0, "Interface mismatch, got %d expected %d\n", UGETW(cdc.wIndex), sc->sc_iface_no); goto tr_setup; } if (cdc.bmRequestType == UCDC_NOTIFICATION && cdc.bNotification == UCDC_N_SERIAL_STATE) { UHSO_DPRINTF(2, "notify = 0x%02x\n", cdc.data[0]); sc->sc_msr = 0; sc->sc_lsr = 0; if (cdc.data[0] & UCDC_N_SERIAL_RI) sc->sc_msr |= SER_RI; if (cdc.data[0] & UCDC_N_SERIAL_DSR) sc->sc_msr |= SER_DSR; if (cdc.data[0] & UCDC_N_SERIAL_DCD) sc->sc_msr |= SER_DCD; ucom_status_change(&sc->sc_ucom[0]); } case USB_ST_SETUP: tr_setup: default: if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static void uhso_ucom_cfg_get_status(struct ucom_softc *ucom, uint8_t *lsr, uint8_t *msr) { struct uhso_softc *sc = ucom->sc_parent; *lsr = sc->sc_lsr; *msr = sc->sc_msr; } static void uhso_ucom_cfg_set_dtr(struct ucom_softc *ucom, uint8_t onoff) { struct uhso_softc *sc = ucom->sc_parent; if (!(UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK)) return; if (onoff) sc->sc_line |= UCDC_LINE_DTR; else sc->sc_line &= ~UCDC_LINE_DTR; uhso_bs_cfg(sc); } static void uhso_ucom_cfg_set_rts(struct ucom_softc *ucom, uint8_t onoff) { struct uhso_softc *sc = ucom->sc_parent; if (!(UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK)) return; if (onoff) sc->sc_line |= UCDC_LINE_RTS; else sc->sc_line &= ~UCDC_LINE_RTS; uhso_bs_cfg(sc); } static void uhso_ucom_start_read(struct ucom_softc *ucom) { struct uhso_softc *sc = ucom->sc_parent; UHSO_DPRINTF(3, "unit=%d, subunit=%d\n", ucom->sc_super->sc_unit, ucom->sc_subunit); if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_MUX) { sc->sc_tty[ucom->sc_subunit].ht_open = 1; usbd_transfer_start(sc->sc_xfer[UHSO_MUX_ENDPT_INTR]); } else if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK) { sc->sc_tty[0].ht_open = 1; usbd_transfer_start(sc->sc_xfer[UHSO_BULK_ENDPT_READ]); if (sc->sc_xfer[UHSO_BULK_ENDPT_INTR] != NULL) usbd_transfer_start(sc->sc_xfer[UHSO_BULK_ENDPT_INTR]); } } static void uhso_ucom_stop_read(struct ucom_softc *ucom) { struct uhso_softc *sc = ucom->sc_parent; if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_MUX) { sc->sc_tty[ucom->sc_subunit].ht_open = 0; usbd_transfer_stop( sc->sc_tty[ucom->sc_subunit].ht_xfer[UHSO_CTRL_READ]); } else if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK) { sc->sc_tty[0].ht_open = 0; usbd_transfer_start(sc->sc_xfer[UHSO_BULK_ENDPT_READ]); if (sc->sc_xfer[UHSO_BULK_ENDPT_INTR] != NULL) usbd_transfer_stop(sc->sc_xfer[UHSO_BULK_ENDPT_INTR]); } } static void uhso_ucom_start_write(struct ucom_softc *ucom) { struct uhso_softc *sc = ucom->sc_parent; if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_MUX) { UHSO_DPRINTF(3, "local unit %d\n", ucom->sc_subunit); usbd_transfer_start(sc->sc_xfer[UHSO_MUX_ENDPT_INTR]); usbd_xfer_set_priv( sc->sc_tty[ucom->sc_subunit].ht_xfer[UHSO_CTRL_WRITE], &sc->sc_tty[ucom->sc_subunit]); usbd_transfer_start( sc->sc_tty[ucom->sc_subunit].ht_xfer[UHSO_CTRL_WRITE]); } else if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK) { usbd_transfer_start(sc->sc_xfer[UHSO_BULK_ENDPT_WRITE]); } } static void uhso_ucom_stop_write(struct ucom_softc *ucom) { struct uhso_softc *sc = ucom->sc_parent; if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_MUX) { usbd_transfer_stop( sc->sc_tty[ucom->sc_subunit].ht_xfer[UHSO_CTRL_WRITE]); } else if (UHSO_IFACE_USB_TYPE(sc->sc_type) & UHSO_IF_BULK) { usbd_transfer_stop(sc->sc_xfer[UHSO_BULK_ENDPT_WRITE]); } } static int uhso_attach_ifnet(struct uhso_softc *sc, struct usb_interface *iface, int type) { struct ifnet *ifp; usb_error_t uerr; struct sysctl_ctx_list *sctx; struct sysctl_oid *soid; unsigned int devunit; uerr = usbd_transfer_setup(sc->sc_udev, &iface->idesc->bInterfaceNumber, sc->sc_if_xfer, uhso_ifnet_config, UHSO_IFNET_MAX, sc, &sc->sc_mtx); if (uerr) { UHSO_DPRINTF(0, "usbd_transfer_setup failed: %s\n", usbd_errstr(uerr)); return (-1); } sc->sc_ifp = ifp = if_alloc(IFT_OTHER); if (sc->sc_ifp == NULL) { device_printf(sc->sc_dev, "if_alloc() failed\n"); return (-1); } callout_init_mtx(&sc->sc_c, &sc->sc_mtx, 0); mtx_lock(&sc->sc_mtx); callout_reset(&sc->sc_c, 1, uhso_if_rxflush, sc); mtx_unlock(&sc->sc_mtx); /* * We create our own unit numbers for ifnet devices because the * USB interface unit numbers can be at arbitrary positions yielding * odd looking device names. */ devunit = alloc_unr(uhso_ifnet_unit); if_initname(ifp, device_get_name(sc->sc_dev), devunit); ifp->if_mtu = UHSO_MAX_MTU; ifp->if_ioctl = uhso_if_ioctl; ifp->if_init = uhso_if_init; ifp->if_start = uhso_if_start; ifp->if_output = uhso_if_output; ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_NOARP; ifp->if_softc = sc; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_RAW, 0); sctx = device_get_sysctl_ctx(sc->sc_dev); soid = device_get_sysctl_tree(sc->sc_dev); /* Unlocked read... */ SYSCTL_ADD_STRING(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "netif", CTLFLAG_RD, ifp->if_xname, 0, "Attached network interface"); return (0); } static void uhso_ifnet_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct mbuf *m; struct usb_page_cache *pc; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status=%d, actlen=%d\n", USB_GET_STATE(xfer), actlen); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: if (actlen > 0 && (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING)) { pc = usbd_xfer_get_frame(xfer, 0); if (mbufq_full(&sc->sc_rxq)) break; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); usbd_copy_out(pc, 0, mtod(m, uint8_t *), actlen); m->m_pkthdr.len = m->m_len = actlen; /* Enqueue frame for further processing */ mbufq_enqueue(&sc->sc_rxq, m); if (!callout_pending(&sc->sc_c) || !callout_active(&sc->sc_c)) { callout_schedule(&sc->sc_c, 1); } } /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } /* * Deferred RX processing, called with mutex locked. * * Each frame we receive might contain several small ip-packets as well * as partial ip-packets. We need to separate/assemble them into individual * packets before sending them to the ip-layer. */ static void uhso_if_rxflush(void *arg) { + struct epoch_tracker et; struct uhso_softc *sc = arg; struct ifnet *ifp = sc->sc_ifp; uint8_t *cp; struct mbuf *m, *m0, *mwait; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif uint16_t iplen; int isr; m = NULL; mwait = sc->sc_mwait; + NET_EPOCH_ENTER(et); for (;;) { if (m == NULL) { if ((m = mbufq_dequeue(&sc->sc_rxq)) == NULL) break; UHSO_DPRINTF(3, "dequeue m=%p, len=%d\n", m, m->m_len); } mtx_unlock(&sc->sc_mtx); /* Do we have a partial packet waiting? */ if (mwait != NULL) { m0 = mwait; mwait = NULL; UHSO_DPRINTF(3, "partial m0=%p(%d), concat w/ m=%p(%d)\n", m0, m0->m_len, m, m->m_len); m_catpkt(m0, m); m = m_pullup(m0, sizeof(struct ip)); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); UHSO_DPRINTF(0, "m_pullup failed\n"); mtx_lock(&sc->sc_mtx); continue; } UHSO_DPRINTF(3, "Constructed mbuf=%p, len=%d\n", m, m->m_pkthdr.len); } cp = mtod(m, uint8_t *); ip = (struct ip *)cp; #ifdef INET6 ip6 = (struct ip6_hdr *)cp; #endif /* Check for IPv4 */ if (ip->ip_v == IPVERSION) { iplen = htons(ip->ip_len); isr = NETISR_IP; } #ifdef INET6 /* Check for IPv6 */ else if ((ip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION) { iplen = htons(ip6->ip6_plen); isr = NETISR_IPV6; } #endif else { UHSO_DPRINTF(0, "got unexpected ip version %d, " "m=%p, len=%d\n", (*cp & 0xf0) >> 4, m, m->m_len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); UHSO_HEXDUMP(cp, 4); m_freem(m); m = NULL; mtx_lock(&sc->sc_mtx); continue; } if (iplen == 0) { UHSO_DPRINTF(0, "Zero IP length\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); m = NULL; mtx_lock(&sc->sc_mtx); continue; } UHSO_DPRINTF(3, "m=%p, len=%d, cp=%p, iplen=%d\n", m, m->m_pkthdr.len, cp, iplen); m0 = NULL; /* More IP packets in this mbuf */ if (iplen < m->m_pkthdr.len) { m0 = m; /* * Allocate a new mbuf for this IP packet and * copy the IP-packet into it. */ m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); memcpy(mtod(m, uint8_t *), mtod(m0, uint8_t *), iplen); m->m_pkthdr.len = m->m_len = iplen; /* Adjust the size of the original mbuf */ m_adj(m0, iplen); m0 = m_defrag(m0, M_WAITOK); UHSO_DPRINTF(3, "New mbuf=%p, len=%d/%d, m0=%p, " "m0_len=%d/%d\n", m, m->m_pkthdr.len, m->m_len, m0, m0->m_pkthdr.len, m0->m_len); } else if (iplen > m->m_pkthdr.len) { UHSO_DPRINTF(3, "Deferred mbuf=%p, len=%d\n", m, m->m_pkthdr.len); mwait = m; m = NULL; mtx_lock(&sc->sc_mtx); continue; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; /* Dispatch to IP layer */ BPF_MTAP(sc->sc_ifp, m); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); m = m0 != NULL ? m0 : NULL; mtx_lock(&sc->sc_mtx); } + NET_EPOCH_EXIT(et); sc->sc_mwait = mwait; } static void uhso_ifnet_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct uhso_softc *sc = usbd_xfer_softc(xfer); struct ifnet *ifp = sc->sc_ifp; struct usb_page_cache *pc; struct mbuf *m; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); UHSO_DPRINTF(3, "status %d, actlen=%d\n", USB_GET_STATE(xfer), actlen); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; case USB_ST_SETUP: tr_setup: IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (m->m_pkthdr.len > MCLBYTES) m->m_pkthdr.len = MCLBYTES; usbd_xfer_set_frame_len(xfer, 0, m->m_pkthdr.len); pc = usbd_xfer_get_frame(xfer, 0); usbd_m_copy_in(pc, 0, m, 0, m->m_pkthdr.len); usbd_transfer_submit(xfer); BPF_MTAP(ifp, m); m_freem(m); break; default: UHSO_DPRINTF(0, "error: %s\n", usbd_errstr(error)); if (error == USB_ERR_CANCELLED) break; usbd_xfer_set_stall(xfer); goto tr_setup; } } static int uhso_if_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct uhso_softc *sc; sc = ifp->if_softc; switch (cmd) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { uhso_if_init(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mtx_lock(&sc->sc_mtx); uhso_if_stop(sc); mtx_unlock(&sc->sc_mtx); } } break; case SIOCSIFADDR: case SIOCADDMULTI: case SIOCDELMULTI: break; default: return (EINVAL); } return (0); } static void uhso_if_init(void *priv) { struct uhso_softc *sc = priv; struct ifnet *ifp = sc->sc_ifp; mtx_lock(&sc->sc_mtx); uhso_if_stop(sc); ifp = sc->sc_ifp; ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; mtx_unlock(&sc->sc_mtx); UHSO_DPRINTF(2, "ifnet initialized\n"); } static int uhso_if_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { int error; /* Only IPv4/6 support */ if (dst->sa_family != AF_INET #ifdef INET6 && dst->sa_family != AF_INET6 #endif ) { return (EAFNOSUPPORT); } error = (ifp->if_transmit)(ifp, m0); if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } static void uhso_if_start(struct ifnet *ifp) { struct uhso_softc *sc = ifp->if_softc; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { UHSO_DPRINTF(1, "Not running\n"); return; } mtx_lock(&sc->sc_mtx); usbd_transfer_start(sc->sc_if_xfer[UHSO_IFNET_READ]); usbd_transfer_start(sc->sc_if_xfer[UHSO_IFNET_WRITE]); mtx_unlock(&sc->sc_mtx); UHSO_DPRINTF(3, "interface started\n"); } static void uhso_if_stop(struct uhso_softc *sc) { usbd_transfer_stop(sc->sc_if_xfer[UHSO_IFNET_READ]); usbd_transfer_stop(sc->sc_if_xfer[UHSO_IFNET_WRITE]); sc->sc_ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } diff --git a/sys/dev/usb/net/usb_ethernet.c b/sys/dev/usb/net/usb_ethernet.c index b637f1ace0e1..b1d1ff202fc4 100644 --- a/sys/dev/usb/net/usb_ethernet.c +++ b/sys/dev/usb/net/usb_ethernet.c @@ -1,671 +1,670 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Andrew Thompson (thompsa@FreeBSD.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_net, OID_AUTO, ue, CTLFLAG_RD, 0, "USB Ethernet parameters"); #define UE_LOCK(_ue) mtx_lock((_ue)->ue_mtx) #define UE_UNLOCK(_ue) mtx_unlock((_ue)->ue_mtx) #define UE_LOCK_ASSERT(_ue, t) mtx_assert((_ue)->ue_mtx, t) MODULE_DEPEND(uether, usb, 1, 1, 1); MODULE_DEPEND(uether, miibus, 1, 1, 1); static struct unrhdr *ueunit; static usb_proc_callback_t ue_attach_post_task; static usb_proc_callback_t ue_promisc_task; static usb_proc_callback_t ue_setmulti_task; static usb_proc_callback_t ue_ifmedia_task; static usb_proc_callback_t ue_tick_task; static usb_proc_callback_t ue_start_task; static usb_proc_callback_t ue_stop_task; static void ue_init(void *); static void ue_start(struct ifnet *); static int ue_ifmedia_upd(struct ifnet *); static void ue_watchdog(void *); /* * Return values: * 0: success * Else: device has been detached */ uint8_t uether_pause(struct usb_ether *ue, unsigned int _ticks) { if (usb_proc_is_gone(&ue->ue_tq)) { /* nothing to do */ return (1); } usb_pause_mtx(ue->ue_mtx, _ticks); return (0); } static void ue_queue_command(struct usb_ether *ue, usb_proc_callback_t *fn, struct usb_proc_msg *t0, struct usb_proc_msg *t1) { struct usb_ether_cfg_task *task; UE_LOCK_ASSERT(ue, MA_OWNED); if (usb_proc_is_gone(&ue->ue_tq)) { return; /* nothing to do */ } /* * NOTE: The task cannot get executed before we drop the * "sc_mtx" mutex. It is safe to update fields in the message * structure after that the message got queued. */ task = (struct usb_ether_cfg_task *) usb_proc_msignal(&ue->ue_tq, t0, t1); /* Setup callback and self pointers */ task->hdr.pm_callback = fn; task->ue = ue; /* * Start and stop must be synchronous! */ if ((fn == ue_start_task) || (fn == ue_stop_task)) usb_proc_mwait(&ue->ue_tq, t0, t1); } struct ifnet * uether_getifp(struct usb_ether *ue) { return (ue->ue_ifp); } struct mii_data * uether_getmii(struct usb_ether *ue) { return (device_get_softc(ue->ue_miibus)); } void * uether_getsc(struct usb_ether *ue) { return (ue->ue_sc); } static int ue_sysctl_parent(SYSCTL_HANDLER_ARGS) { struct usb_ether *ue = arg1; const char *name; name = device_get_nameunit(ue->ue_dev); return SYSCTL_OUT_STR(req, name); } int uether_ifattach(struct usb_ether *ue) { int error; /* check some critical parameters */ if ((ue->ue_dev == NULL) || (ue->ue_udev == NULL) || (ue->ue_mtx == NULL) || (ue->ue_methods == NULL)) return (EINVAL); error = usb_proc_create(&ue->ue_tq, ue->ue_mtx, device_get_nameunit(ue->ue_dev), USB_PRI_MED); if (error) { device_printf(ue->ue_dev, "could not setup taskqueue\n"); goto error; } /* fork rest of the attach code */ UE_LOCK(ue); ue_queue_command(ue, ue_attach_post_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); UE_UNLOCK(ue); error: return (error); } void uether_ifattach_wait(struct usb_ether *ue) { UE_LOCK(ue); usb_proc_mwait(&ue->ue_tq, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); UE_UNLOCK(ue); } static void ue_attach_post_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp; int error; char num[14]; /* sufficient for 32 bits */ /* first call driver's post attach routine */ ue->ue_methods->ue_attach_post(ue); UE_UNLOCK(ue); ue->ue_unit = alloc_unr(ueunit); usb_callout_init_mtx(&ue->ue_watchdog, ue->ue_mtx, 0); sysctl_ctx_init(&ue->ue_sysctl_ctx); mbufq_init(&ue->ue_rxq, 0 /* unlimited length */); error = 0; CURVNET_SET_QUIET(vnet0); ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(ue->ue_dev, "could not allocate ifnet\n"); goto fail; } ifp->if_softc = ue; if_initname(ifp, "ue", ue->ue_unit); if (ue->ue_methods->ue_attach_post_sub != NULL) { ue->ue_ifp = ifp; error = ue->ue_methods->ue_attach_post_sub(ue); } else { ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; if (ue->ue_methods->ue_ioctl != NULL) ifp->if_ioctl = ue->ue_methods->ue_ioctl; else ifp->if_ioctl = uether_ioctl; ifp->if_start = ue_start; ifp->if_init = ue_init; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); ue->ue_ifp = ifp; if (ue->ue_methods->ue_mii_upd != NULL && ue->ue_methods->ue_mii_sts != NULL) { /* device_xxx() depends on this */ mtx_lock(&Giant); error = mii_attach(ue->ue_dev, &ue->ue_miibus, ifp, ue_ifmedia_upd, ue->ue_methods->ue_mii_sts, BMSR_DEFCAPMASK, MII_PHY_ANY, MII_OFFSET_ANY, 0); mtx_unlock(&Giant); } } if (error) { device_printf(ue->ue_dev, "attaching PHYs failed\n"); goto fail; } if_printf(ifp, " on %s\n", device_get_nameunit(ue->ue_dev)); ether_ifattach(ifp, ue->ue_eaddr); /* Tell upper layer we support VLAN oversized frames. */ if (ifp->if_capabilities & IFCAP_VLAN_MTU) ifp->if_hdrlen = sizeof(struct ether_vlan_header); CURVNET_RESTORE(); snprintf(num, sizeof(num), "%u", ue->ue_unit); ue->ue_sysctl_oid = SYSCTL_ADD_NODE(&ue->ue_sysctl_ctx, &SYSCTL_NODE_CHILDREN(_net, ue), OID_AUTO, num, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&ue->ue_sysctl_ctx, SYSCTL_CHILDREN(ue->ue_sysctl_oid), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, ue, 0, ue_sysctl_parent, "A", "parent device"); UE_LOCK(ue); return; fail: CURVNET_RESTORE(); /* drain mbuf queue */ mbufq_drain(&ue->ue_rxq); /* free unit */ free_unr(ueunit, ue->ue_unit); if (ue->ue_ifp != NULL) { if_free(ue->ue_ifp); ue->ue_ifp = NULL; } UE_LOCK(ue); return; } void uether_ifdetach(struct usb_ether *ue) { struct ifnet *ifp; /* wait for any post attach or other command to complete */ usb_proc_drain(&ue->ue_tq); /* read "ifnet" pointer after taskqueue drain */ ifp = ue->ue_ifp; if (ifp != NULL) { /* we are not running any more */ UE_LOCK(ue); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; UE_UNLOCK(ue); /* drain any callouts */ usb_callout_drain(&ue->ue_watchdog); /* detach miibus */ if (ue->ue_miibus != NULL) { mtx_lock(&Giant); /* device_xxx() depends on this */ device_delete_child(ue->ue_dev, ue->ue_miibus); mtx_unlock(&Giant); } /* detach ethernet */ ether_ifdetach(ifp); /* free interface instance */ if_free(ifp); /* free sysctl */ sysctl_ctx_free(&ue->ue_sysctl_ctx); /* drain mbuf queue */ mbufq_drain(&ue->ue_rxq); /* free unit */ free_unr(ueunit, ue->ue_unit); } /* free taskqueue, if any */ usb_proc_free(&ue->ue_tq); } uint8_t uether_is_gone(struct usb_ether *ue) { return (usb_proc_is_gone(&ue->ue_tq)); } void uether_init(void *arg) { ue_init(arg); } static void ue_init(void *arg) { struct usb_ether *ue = arg; UE_LOCK(ue); ue_queue_command(ue, ue_start_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); UE_UNLOCK(ue); } static void ue_start_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp = ue->ue_ifp; UE_LOCK_ASSERT(ue, MA_OWNED); ue->ue_methods->ue_init(ue); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if (ue->ue_methods->ue_tick != NULL) usb_callout_reset(&ue->ue_watchdog, hz, ue_watchdog, ue); } static void ue_stop_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; UE_LOCK_ASSERT(ue, MA_OWNED); usb_callout_stop(&ue->ue_watchdog); ue->ue_methods->ue_stop(ue); } void uether_start(struct ifnet *ifp) { ue_start(ifp); } static void ue_start(struct ifnet *ifp) { struct usb_ether *ue = ifp->if_softc; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; UE_LOCK(ue); ue->ue_methods->ue_start(ue); UE_UNLOCK(ue); } static void ue_promisc_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; ue->ue_methods->ue_setpromisc(ue); } static void ue_setmulti_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; ue->ue_methods->ue_setmulti(ue); } int uether_ifmedia_upd(struct ifnet *ifp) { return (ue_ifmedia_upd(ifp)); } static int ue_ifmedia_upd(struct ifnet *ifp) { struct usb_ether *ue = ifp->if_softc; /* Defer to process context */ UE_LOCK(ue); ue_queue_command(ue, ue_ifmedia_task, &ue->ue_media_task[0].hdr, &ue->ue_media_task[1].hdr); UE_UNLOCK(ue); return (0); } static void ue_ifmedia_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp = ue->ue_ifp; ue->ue_methods->ue_mii_upd(ifp); } static void ue_watchdog(void *arg) { struct usb_ether *ue = arg; struct ifnet *ifp = ue->ue_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ue_queue_command(ue, ue_tick_task, &ue->ue_tick_task[0].hdr, &ue->ue_tick_task[1].hdr); usb_callout_reset(&ue->ue_watchdog, hz, ue_watchdog, ue); } static void ue_tick_task(struct usb_proc_msg *_task) { struct usb_ether_cfg_task *task = (struct usb_ether_cfg_task *)_task; struct usb_ether *ue = task->ue; struct ifnet *ifp = ue->ue_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ue->ue_methods->ue_tick(ue); } int uether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct usb_ether *ue = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct mii_data *mii; int error = 0; switch (command) { case SIOCSIFFLAGS: UE_LOCK(ue); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) ue_queue_command(ue, ue_promisc_task, &ue->ue_promisc_task[0].hdr, &ue->ue_promisc_task[1].hdr); else ue_queue_command(ue, ue_start_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); } else { ue_queue_command(ue, ue_stop_task, &ue->ue_sync_task[0].hdr, &ue->ue_sync_task[1].hdr); } UE_UNLOCK(ue); break; case SIOCADDMULTI: case SIOCDELMULTI: UE_LOCK(ue); ue_queue_command(ue, ue_setmulti_task, &ue->ue_multi_task[0].hdr, &ue->ue_multi_task[1].hdr); UE_UNLOCK(ue); break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: if (ue->ue_miibus != NULL) { mii = device_get_softc(ue->ue_miibus); error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, command); } else error = ether_ioctl(ifp, command, data); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int uether_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ueunit = new_unrhdr(0, INT_MAX, NULL); break; case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t uether_mod = { "uether", uether_modevent, 0 }; struct mbuf * uether_newbuf(void) { struct mbuf *m_new; m_new = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m_new == NULL) return (NULL); m_new->m_len = m_new->m_pkthdr.len = MCLBYTES; m_adj(m_new, ETHER_ALIGN); return (m_new); } int uether_rxmbuf(struct usb_ether *ue, struct mbuf *m, unsigned int len) { struct ifnet *ifp = ue->ue_ifp; UE_LOCK_ASSERT(ue, MA_OWNED); /* finalize mbuf */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } int uether_rxbuf(struct usb_ether *ue, struct usb_page_cache *pc, unsigned int offset, unsigned int len) { struct ifnet *ifp = ue->ue_ifp; struct mbuf *m; UE_LOCK_ASSERT(ue, MA_OWNED); if (len < ETHER_HDR_LEN || len > MCLBYTES - ETHER_ALIGN) return (1); m = uether_newbuf(); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (ENOMEM); } usbd_copy_out(pc, offset, mtod(m, uint8_t *), len); /* finalize mbuf */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } void uether_rxflush(struct usb_ether *ue) { struct ifnet *ifp = ue->ue_ifp; - struct mbuf *m; + struct epoch_tracker et; + struct mbuf *m, *n; UE_LOCK_ASSERT(ue, MA_OWNED); - for (;;) { - m = mbufq_dequeue(&ue->ue_rxq); - if (m == NULL) - break; - - /* - * The USB xfer has been resubmitted so its safe to unlock now. - */ - UE_UNLOCK(ue); + n = mbufq_flush(&ue->ue_rxq); + UE_UNLOCK(ue); + NET_EPOCH_ENTER(et); + while ((m = n) != NULL) { + n = STAILQ_NEXT(m, m_stailqpkt); + m->m_nextpkt = NULL; ifp->if_input(ifp, m); - UE_LOCK(ue); } + NET_EPOCH_EXIT(et); + UE_LOCK(ue); } /* * USB net drivers are run by DRIVER_MODULE() thus SI_SUB_DRIVERS, * SI_ORDER_MIDDLE. Run uether after that. */ DECLARE_MODULE(uether, uether_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(uether, 1); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 1f480462db2a..dbb6ab4d261a 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4252 +1,4254 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include /* XXXGL: net_epoch should move out there */ +#include /* XXXGL: net_epoch should move out there */ #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_locked(void *); static void so_rdknl_assert_unlocked(void *); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_locked(void *); static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_locked, so_rdknl_assert_unlocked); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { static struct timeval lastover; static struct timeval overinterval = { 60, 0 }; static int overcount; struct socket *so; u_int over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif overcount++; if (ratecheck(&lastover, &overinterval)) { log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_locked, so_rdknl_assert_unlocked); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; so->so_options = head->so_options & ~SO_ACCEPTCONN; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #ifdef SCTP /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_locked, so_rdknl_assert_unlocked); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we don't have SS_NOFDREF neither SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After relock the socket may not move to so_comp since it * doesn't have PCB already, but it may be removed from * so_incomp. If that happens, we share responsiblity on * freeing the socket, but soclose() has already removed * it from queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; /* This is guarenteed not to be the last. */ refcount_release(&sol->so_count); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele(sol); KASSERT(so->so_count == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); SOCK_LOCK(so); if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); /* Guaranteed not to be the last. */ refcount_release(&so->so_count); } } KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (sp->so_count == 0) { SOCK_UNLOCK(sp); soabort(sp); } else /* sp is now in sofree() */ SOCK_UNLOCK(sp); } } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->sw_encrypt != NULL) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_NOMAP | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { error = ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); if (error) { m_freem(top); goto release; } } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->sw_encrypt != NULL) { /* * Note that error is intentionally * ignored. * * Like sendfile(), we rely on the * completion routine (pru_ready()) * to free the mbufs in the event that * pru_send() encountered an error and * did not append them to the sockbuf. */ soref(so); ktls_enqueue(top, so, tls_enq_cnt); } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { m_freem(top); m_freem(control); error = ENOTCONN; } CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m == NULL) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else goto dontblock; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_NOMAP) != 0) error = m_unmappedtouio(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); else error = ENOTCONN; CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { SOLISTEN_LOCK(so); so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } goto done; } CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele(head); return; } /* Not the last one, as so holds a ref. */ refcount_release(&head->so_count); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); goto again; } SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_locked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } static void so_rdknl_assert_unlocked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_locked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } static void so_wrknl_assert_unlocked(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/net/if.c b/sys/net/if.c index 406c57c8efb4..089a481c3b1d 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1,4600 +1,4593 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; __read_mostly epoch_t net_epoch; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; ifp = ifnet_byindex_locked(idx); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { - struct epoch_tracker et; struct ifnet *ifp; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + ifp = ifnet_byindex_locked(idx); - if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { - NET_EPOCH_EXIT(et); + if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); - } if_ref(ifp); - NET_EPOCH_EXIT(et); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { - struct epoch_tracker et; struct ifnet *ifp; struct ifaddr *ifa = NULL; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); - NET_EPOCH_EXIT(et); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); if (ifp->if_numa_domain == IF_NODOM) free(ifp, M_IFNET); else free_domain(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); net_epoch = epoch_alloc("Net", 0); } SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE bool shutdown; shutdown = ifp->if_vnet->vnet_shutdown; #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late destruction of multicast options * which lead to leave group calls, which in turn access the * belonging ifnet structure: */ epoch_drain_callbacks(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; void *old; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ if (ifp->if_vnet->vnet_shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ if (ifp->if_vnet->vnet_shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; int freeifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } freeifgl = 0; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; int ifglfree; IFNET_WLOCK(); while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { ifgl = CK_STAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); ifglfree = 0; if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); ifglfree = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(ifgm, M_TEMP); if (ifglfree) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { - struct epoch_tracker et; int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; + NET_EPOCH_ASSERT(); + if (ifgr->ifgr_len == 0) { - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); - NET_EPOCH_EXIT(et); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { - if (len < sizeof(ifgrq)) { - NET_EPOCH_EXIT(et); + if (len < sizeof(ifgrq)) return (EINVAL); - } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); - if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { - NET_EPOCH_EXIT(et); + if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) return (error); - } len -= sizeof(ifgrq); ifgp++; } - NET_EPOCH_EXIT(et); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_addr_runlock(struct ifnet *ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_rlock(if_t ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_runlock(if_t ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { NET_EPOCH_ENTER(et); info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (info.rti_ifa != NULL) ifa_ref(info.rti_ifa); NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0 && !(cmd == RTM_ADD && error == EEXIST) && !(cmd == RTM_DELETE && error == ENOENT)) if_printf(ifp, "%s failed: %d\n", otype, error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; - MPASS(in_epoch(net_epoch_preempt)); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; MPASS(in_epoch(net_epoch_preempt)); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; + /* XXXGL: reference ifp? */ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { - struct ifnet *ifp = (struct ifnet *)arg; - int link_state = ifp->if_link_state; - CURVNET_SET(ifp->if_vnet); + struct epoch_tracker et; + struct ifnet *ifp; + int link_state; + + NET_EPOCH_ENTER(et); + ifp = arg; + link_state = ifp->if_link_state; - /* Notify that the link state has changed. */ + CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (strcmp(new_name, ifp->if_xname) == 0) break; if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET NETDUMP_REINIT(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): - if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) - return (error); + { + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); + error = if_getgroup((struct ifgroupreq *)data, ifp); + NET_EPOCH_EXIT(et); break; + } case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; - struct ifmediareq *ifmrp; + struct ifmediareq *ifmrp = NULL; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ if (so->so_vnet->vnet_shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif - switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); - CURVNET_RESTORE(); - return (error); + goto out_noref; #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); - CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; - return (error); + goto out_noref; } #endif } #ifdef COMPAT_FREEBSD32 - ifmrp = NULL; switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS - struct epoch_tracker et; struct ifnet *oifp; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - NET_EPOCH_EXIT(et); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { - struct epoch_tracker et; struct ifnet *oifp; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - NET_EPOCH_EXIT(et); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; - struct epoch_tracker et; int rc; + NET_EPOCH_ASSERT(); + rc = 0; - NET_EPOCH_ENTER(et); ifa = ifp->if_addr; if (ifa == NULL) { rc = EINVAL; goto out; } sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { rc = EINVAL; goto out; } if (len != sdl->sdl_alen) { /* don't allow length to change */ rc = EINVAL; goto out; } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: rc = ENODEV; goto out; } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ - NET_EPOCH_EXIT(et); if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); - out: - NET_EPOCH_EXIT(et); +out: return (rc); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 991ee0882625..7a750558890a 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -1,1443 +1,1447 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif if (plle) *plle = NULL; eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_mark_used(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ether_8021q_frame(mp, ifp, ifp, 0, pcp)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { + struct epoch_tracker et; struct mbuf *mn; /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ + CURVNET_SET_QUIET(ifp->if_vnet); + NET_EPOCH_ENTER(et); while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* - * We will rely on rcvif being set properly in the deferred context, - * so assert it is correct here. + * We will rely on rcvif being set properly in the deferred + * context, so assert it is correct here. */ MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); - CURVNET_SET_QUIET(ifp->if_vnet); netisr_dispatch(NETISR_ETHER, m); - CURVNET_RESTORE(); m = mn; } + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; + NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. */ int vlan_mtag_pcp = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(vid, pcp, 0); if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap(*mp, tag); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's UUID and the interface * name to attempt to provide a unique but stable address. Pseudo-interfaces * which require a MAC address should use this function to allocate * non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { #define ETHER_GEN_ADDR_BUFSIZ HOSTUUIDLEN + IFNAMSIZ + 2 SHA1_CTX ctx; char buf[ETHER_GEN_ADDR_BUFSIZ]; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname); SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 6a5571a996f5..af911d09c945 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -1,702 +1,704 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #endif /* INET6 */ #include #include #include #include #include static const char gifname[] = "gif"; MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static struct sx gif_ioctl_sx; SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); static void gif_delete_tunnel(struct gif_softc *); static int gif_ioctl(struct ifnet *, u_long, caddr_t); static int gif_transmit(struct ifnet *, struct mbuf *); static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gif_cloner); #define V_gif_cloner VNET(gif_cloner) SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limitation on nesting of gif tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gif tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. */ #define MAX_GIF_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); static int gif_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); sc->gif_fibnum = curthread->td_proc->p_fibnum; GIF2IFP(sc) = if_alloc(IFT_GIF); GIF2IFP(sc)->if_softc = sc; if_initname(GIF2IFP(sc), gifname, unit); GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; GIF2IFP(sc)->if_ioctl = gif_ioctl; GIF2IFP(sc)->if_transmit = gif_transmit; GIF2IFP(sc)->if_qflush = gif_qflush; GIF2IFP(sc)->if_output = gif_output; GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GIF2IFP(sc)); bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); return (0); } static void gif_clone_destroy(struct ifnet *ifp) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; gif_delete_tunnel(sc); if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gif_ioctl_sx); GIF_WAIT(); if_free(ifp); free(sc, M_GIF); } static void vnet_gif_init(const void *unused __unused) { V_gif_cloner = if_clone_simple(gifname, gif_clone_create, gif_clone_destroy, 0); #ifdef INET in_gif_init(); #endif #ifdef INET6 in6_gif_init(); #endif } VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_init, NULL); static void vnet_gif_uninit(const void *unused __unused) { if_clone_detach(V_gif_cloner); #ifdef INET in_gif_uninit(); #endif #ifdef INET6 in6_gif_uninit(); #endif } VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_uninit, NULL); static int gifmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); struct gif_list * gif_hashinit(void) { struct gif_list *hash; int i; hash = malloc(sizeof(struct gif_list) * GIF_HASH_SIZE, M_GIF, M_WAITOK); for (i = 0; i < GIF_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gif_hashdestroy(struct gif_list *hash) { free(hash, M_GIF); } #define MTAG_GIF 1080679712 static int gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; struct etherip_header *eth; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif uint32_t af; uint8_t proto, ecn; int error; GIF_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto err; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gif_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GIF, V_max_gif_nesting)) != 0) { m_freem(m); goto err; } /* Now pull back the af that we stashed in the csum_data. */ if (ifp->if_bridge) af = AF_LINK; else af = m->m_pkthdr.csum_data; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gif_fibnum); BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* inner AF-specific encapsulation */ ecn = 0; switch (af) { #ifdef INET case AF_INET: proto = IPPROTO_IPV4; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto err; } ip = mtod(m, struct ip *); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos); break; #endif #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { error = ENOBUFS; goto err; } t = 0; ip6 = mtod(m, struct ip6_hdr *); ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow); ecn = (ntohl(t) >> 20) & 0xff; break; #endif case AF_LINK: proto = IPPROTO_ETHERIP; M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto err; } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; eth->eip_ver = ETHERIP_VERSION; eth->eip_resvl = 0; break; default: error = EAFNOSUPPORT; m_freem(m); goto err; } /* XXX should we check if our outer source is legal? */ /* dispatch to output logic based on outer AF */ switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: m_freem(m); } err: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); GIF_RUNLOCK(); return (error); } static void gif_qflush(struct ifnet *ifp __unused) { } int gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gif_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } void gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) { struct etherip_header *eip; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif struct ether_header *eh; struct ifnet *oldifp; int isr, n, af; + NET_EPOCH_ASSERT(); + if (ifp == NULL) { /* just in case */ m_freem(m); return; } m->m_pkthdr.rcvif = ifp; m_clrprotoflags(m); switch (proto) { #ifdef INET case IPPROTO_IPV4: af = AF_INET; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) goto drop; ip = mtod(m, struct ip *); if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos) == 0) { m_freem(m); goto drop; } break; #endif #ifdef INET6 case IPPROTO_IPV6: af = AF_INET6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) goto drop; t = htonl((uint32_t)ecn << 20); ip6 = mtod(m, struct ip6_hdr *); if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow) == 0) { m_freem(m); goto drop; } break; #endif case IPPROTO_ETHERIP: af = AF_LINK; break; default: m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { uint32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } if ((ifp->if_flags & IFF_MONITOR) != 0) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return; } if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) goto drop; } /* * Put the packet to the network layer input queue according to the * specified address family. * Note: older versions of gif_input directly called network layer * input functions, e.g. ip6_input, here. We changed the policy to * prevent too many recursive calls of such input functions, which * might cause kernel panic. But the change may introduce another * problem; if the input queue is full, packets are discarded. * The kernel stack overflow really happened, and we believed * queue-full rarely occurs, so we changed the policy. */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); if (n > m->m_len) m = m_pullup(m, n); if (m == NULL) goto drop; eip = mtod(m, struct etherip_header *); if (eip->eip_ver != ETHERIP_VERSION) { /* discard unknown versions */ m_freem(m); goto drop; } m_adj(m, sizeof(struct etherip_header)); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = ifp; if (ifp->if_bridge) { oldifp = ifp; eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } BRIDGE_INPUT(ifp, m); if (m != NULL && ifp != oldifp) { /* * The bridge gave us back itself or one of the * members for which the frame is addressed. */ ether_demux(ifp, m); return; } } if (m != NULL) m_freem(m); return; default: if (ng_gif_input_orphan_p != NULL) (*ng_gif_input_orphan_p)(ifp, m, af); else m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } static int gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq*)data; struct gif_softc *sc; u_int options; int error; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: case SIOCSIFFLAGS: return (0); case SIOCSIFMTU: if (ifr->ifr_mtu < GIF_MTU_MIN || ifr->ifr_mtu > GIF_MTU_MAX) return (EINVAL); else ifp->if_mtu = ifr->ifr_mtu; return (0); } sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto bad; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gif_family == 0) break; gif_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gif_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gif_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gif_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gif_fibnum = ifr->ifr_fib; break; case GIFGOPTS: options = sc->gif_options; error = copyout(&options, ifr_data_get_ptr(ifr), sizeof(options)); break; case GIFSOPTS: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; error = copyin(ifr_data_get_ptr(ifr), &options, sizeof(options)); if (error) break; if (options & ~GIF_OPTMASK) { error = EINVAL; break; } if (sc->gif_options != options) { switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_setopts(sc, options); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_setopts(sc, options); break; #endif default: /* No need to invoke AF-handler */ sc->gif_options = options; } } break; default: error = EINVAL; break; } if (error == 0 && sc->gif_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } bad: sx_xunlock(&gif_ioctl_sx); return (error); } static void gif_delete_tunnel(struct gif_softc *sc) { sx_assert(&gif_ioctl_sx, SA_XLOCKED); if (sc->gif_family != 0) { CK_LIST_REMOVE(sc, srchash); CK_LIST_REMOVE(sc, chain); /* Wait until it become safe to free gif_hdr */ GIF_WAIT(); free(sc->gif_hdr, M_GIF); } sc->gif_family = 0; GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GIF2IFP(sc), LINK_STATE_DOWN); } diff --git a/sys/net/if_me.c b/sys/net/if_me.c index c9679d23e67b..05bc50cfb917 100644 --- a/sys/net/if_me.c +++ b/sys/net/if_me.c @@ -1,663 +1,665 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MEMTU (1500 - sizeof(struct mobhdr)) static const char mename[] = "me"; static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP"); /* Minimal forwarding header RFC 2004 */ struct mobhdr { uint8_t mob_proto; /* protocol */ uint8_t mob_flags; /* flags */ #define MOB_FLAGS_SP 0x80 /* source present */ uint16_t mob_csum; /* header checksum */ struct in_addr mob_dst; /* original destination address */ struct in_addr mob_src; /* original source addr (optional) */ } __packed; struct me_softc { struct ifnet *me_ifp; u_int me_fibnum; struct in_addr me_src; struct in_addr me_dst; CK_LIST_ENTRY(me_softc) chain; CK_LIST_ENTRY(me_softc) srchash; }; CK_LIST_HEAD(me_list, me_softc); #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) #define ME_RLOCK_TRACKER struct epoch_tracker me_et #define ME_RLOCK() epoch_enter_preempt(net_epoch_preempt, &me_et) #define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &me_et) #define ME_WAIT() epoch_wait_preempt(net_epoch_preempt) #ifndef ME_HASH_SIZE #define ME_HASH_SIZE (1 << 4) #endif VNET_DEFINE_STATIC(struct me_list *, me_hashtbl) = NULL; VNET_DEFINE_STATIC(struct me_list *, me_srchashtbl) = NULL; #define V_me_hashtbl VNET(me_hashtbl) #define V_me_srchashtbl VNET(me_srchashtbl) #define ME_HASH(src, dst) (V_me_hashtbl[\ me_hashval((src), (dst)) & (ME_HASH_SIZE - 1)]) #define ME_SRCHASH(src) (V_me_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (ME_HASH_SIZE - 1)]) static struct sx me_ioctl_sx; SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl"); static int me_clone_create(struct if_clone *, int, caddr_t); static void me_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, me_cloner); #define V_me_cloner VNET(me_cloner) static void me_qflush(struct ifnet *); static int me_transmit(struct ifnet *, struct mbuf *); static int me_ioctl(struct ifnet *, u_long, caddr_t); static int me_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int me_input(struct mbuf *, int, int, void *); static int me_set_tunnel(struct me_softc *, in_addr_t, in_addr_t); static void me_delete_tunnel(struct me_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW, 0, "Minimal Encapsulation for IP (RFC 2004)"); #ifndef MAX_ME_NEST #define MAX_ME_NEST 1 #endif VNET_DEFINE_STATIC(int, max_me_nesting) = MAX_ME_NEST; #define V_max_me_nesting VNET(max_me_nesting) SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_me_nesting), 0, "Max nested tunnels"); static uint32_t me_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static struct me_list * me_hashinit(void) { struct me_list *hash; int i; hash = malloc(sizeof(struct me_list) * ME_HASH_SIZE, M_IFME, M_WAITOK); for (i = 0; i < ME_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } static void vnet_me_init(const void *unused __unused) { V_me_cloner = if_clone_simple(mename, me_clone_create, me_clone_destroy, 0); } VNET_SYSINIT(vnet_me_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_init, NULL); static void vnet_me_uninit(const void *unused __unused) { if (V_me_hashtbl != NULL) { free(V_me_hashtbl, M_IFME); V_me_hashtbl = NULL; ME_WAIT(); free(V_me_srchashtbl, M_IFME); } if_clone_detach(V_me_cloner); } VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_uninit, NULL); static int me_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct me_softc *sc; sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO); sc->me_fibnum = curthread->td_proc->p_fibnum; ME2IFP(sc) = if_alloc(IFT_TUNNEL); ME2IFP(sc)->if_softc = sc; if_initname(ME2IFP(sc), mename, unit); ME2IFP(sc)->if_mtu = MEMTU;; ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; ME2IFP(sc)->if_output = me_output; ME2IFP(sc)->if_ioctl = me_ioctl; ME2IFP(sc)->if_transmit = me_transmit; ME2IFP(sc)->if_qflush = me_qflush; ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(ME2IFP(sc)); bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } static void me_clone_destroy(struct ifnet *ifp) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; me_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&me_ioctl_sx); ME_WAIT(); if_free(ifp); free(sc, M_IFME); } static int me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *src, *dst; struct me_softc *sc; int error; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); } sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } error = me_set_tunnel(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); break; case SIOCDIFPHYADDR: me_delete_tunnel(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (!ME_READY(sc)) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); switch (cmd) { case SIOCGIFPSRCADDR: src->sin_addr = sc->me_src; break; case SIOCGIFPDSTADDR: src->sin_addr = sc->me_dst; break; } error = prison_if(curthread->td_ucred, sintosa(src)); if (error != 0) memset(src, 0, sizeof(*src)); break; case SIOCGTUNFIB: ifr->ifr_fib = sc->me_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->me_fibnum = ifr->ifr_fib; break; default: error = EINVAL; break; } end: sx_xunlock(&me_ioctl_sx); return (error); } static int me_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct me_softc *sc; if (V_me_hashtbl == NULL) return (0); MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { if (sc->me_src.s_addr == ip->ip_dst.s_addr && sc->me_dst.s_addr == ip->ip_src.s_addr) { if ((ME2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* * Check that ingress address belongs to local host. */ static void me_set_running(struct me_softc *sc) { if (in_localip(sc->me_src)) ME2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. */ static void me_srcaddr(void *arg __unused, const struct sockaddr *sa, int event __unused) { const struct sockaddr_in *sin; struct me_softc *sc; /* Check that VNET is ready */ if (V_me_hashtbl == NULL) return; MPASS(in_epoch(net_epoch_preempt)); sin = (const struct sockaddr_in *)sa; CK_LIST_FOREACH(sc, &ME_SRCHASH(sin->sin_addr.s_addr), srchash) { if (sc->me_src.s_addr != sin->sin_addr.s_addr) continue; me_set_running(sc); } } static int me_set_tunnel(struct me_softc *sc, in_addr_t src, in_addr_t dst) { struct me_softc *tmp; sx_assert(&me_ioctl_sx, SA_XLOCKED); if (V_me_hashtbl == NULL) { V_me_hashtbl = me_hashinit(); V_me_srchashtbl = me_hashinit(); } if (sc->me_src.s_addr == src && sc->me_dst.s_addr == dst) return (0); CK_LIST_FOREACH(tmp, &ME_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->me_src.s_addr == src && tmp->me_dst.s_addr == dst) return (EADDRNOTAVAIL); } me_delete_tunnel(sc); sc->me_dst.s_addr = dst; sc->me_src.s_addr = src; CK_LIST_INSERT_HEAD(&ME_HASH(src, dst), sc, chain); CK_LIST_INSERT_HEAD(&ME_SRCHASH(src), sc, srchash); me_set_running(sc); if_link_state_change(ME2IFP(sc), LINK_STATE_UP); return (0); } static void me_delete_tunnel(struct me_softc *sc) { sx_assert(&me_ioctl_sx, SA_XLOCKED); if (ME_READY(sc)) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); ME_WAIT(); sc->me_src.s_addr = 0; sc->me_dst.s_addr = 0; ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ME2IFP(sc), LINK_STATE_DOWN); } } static uint16_t me_in_cksum(uint16_t *p, int nwords) { uint32_t sum = 0; while (nwords-- > 0) sum += *p++; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int me_input(struct mbuf *m, int off, int proto, void *arg) { struct me_softc *sc = arg; struct mobhdr *mh; struct ifnet *ifp; struct ip *ip; int hlen; + NET_EPOCH_ASSERT(); + ifp = ME2IFP(sc); /* checks for short packets */ hlen = sizeof(struct mobhdr); if (m->m_pkthdr.len < sizeof(struct ip) + hlen) hlen -= sizeof(struct in_addr); if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) goto drop; mh = (struct mobhdr *)mtodo(m, sizeof(struct ip)); /* check for wrong flags */ if (mh->mob_flags & (~MOB_FLAGS_SP)) { m_freem(m); goto drop; } if (mh->mob_flags) { if (hlen != sizeof(struct mobhdr)) { m_freem(m); goto drop; } } else hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); /* check mobile header checksum */ if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) { m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif ip = mtod(m, struct ip *); ip->ip_dst = mh->mob_dst; ip->ip_p = mh->mob_proto; ip->ip_sum = 0; ip->ip_len = htons(m->m_pkthdr.len - hlen); if (mh->mob_flags) ip->ip_src = mh->mob_src; memmove(mtodo(m, hlen), ip, sizeof(struct ip)); m_adj(m, hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); M_SETFIB(m, ifp->if_fib); hlen = AF_INET; BPF_MTAP2(ifp, &hlen, sizeof(hlen), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(NETISR_IP, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (IPPROTO_DONE); } static int me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro __unused) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } #define MTAG_ME 1414491977 static int me_transmit(struct ifnet *ifp, struct mbuf *m) { ME_RLOCK_TRACKER; struct mobhdr mh; struct me_softc *sc; struct ip *ip; uint32_t af; int error, hlen, plen; ME_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif error = ENETDOWN; sc = ifp->if_softc; if (sc == NULL || !ME_READY(sc) || (ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_ME, V_max_me_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; if (af != AF_INET) { error = EAFNOSUPPORT; m_freem(m); goto drop; } if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto drop; } ip = mtod(m, struct ip *); /* Fragmented datagramms shouldn't be encapsulated */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { error = EINVAL; m_freem(m); goto drop; } mh.mob_proto = ip->ip_p; mh.mob_src = ip->ip_src; mh.mob_dst = ip->ip_dst; if (in_hosteq(sc->me_src, ip->ip_src)) { hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); mh.mob_flags = 0; } else { hlen = sizeof(struct mobhdr); mh.mob_flags = MOB_FLAGS_SP; } BPF_MTAP2(ifp, &af, sizeof(af), m); plen = m->m_pkthdr.len; ip->ip_src = sc->me_src; ip->ip_dst = sc->me_dst; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->me_fibnum); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) { error = ENOBUFS; goto drop; } memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip)); ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_p = IPPROTO_MOBILE; ip->ip_sum = 0; mh.mob_csum = 0; mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t)); bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } ME_RUNLOCK(); return (error); } static void me_qflush(struct ifnet *ifp __unused) { } static const struct srcaddrtab *me_srcaddrtab = NULL; static const struct encaptab *ecookie = NULL; static const struct encap_config me_encap_cfg = { .proto = IPPROTO_MOBILE, .min_length = sizeof(struct ip) + sizeof(struct mobhdr) - sizeof(in_addr_t), .exact_match = ENCAP_DRV_LOOKUP, .lookup = me_lookup, .input = me_input }; static int memodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: me_srcaddrtab = ip_encap_register_srcaddr(me_srcaddr, NULL, M_WAITOK); ecookie = ip_encap_attach(&me_encap_cfg, NULL, M_WAITOK); break; case MOD_UNLOAD: ip_encap_detach(ecookie); ip_encap_unregister_srcaddr(me_srcaddrtab); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t me_mod = { "if_me", memodevent, 0 }; DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_me, 1); diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index 225bd30b6bd3..aadc58ec5a48 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -1,757 +1,759 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; static int in_stf_input(struct mbuf *, int, int, void *); static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_IPV6, .min_length = sizeof(struct ip), .exact_match = (sizeof(in_addr_t) << 3) + 8, .check = stf_encapcheck, .input = in_stf_input }; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. */ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err __unused; err = ip_encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct rm_priotracker in_ifa_tracker; struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; if_addr_runlock(ifp); return (0); } if_addr_runlock(ifp); return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with broadcast */ IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { struct nhop4_basic nh4; if (fib4_lookup_nh_basic(sc->sc_fibnum, *in, 0, 0, &nh4) != 0) return (-1); if (nh4.nh_ifp != inifp) return (-1); } return 0; } static int stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; bcopy(GET_V4(in6), &in4, sizeof(in4)); return stf_checkaddr4(sc, &in4, inifp); } /* * reject anything that look suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } static int in_stf_input(struct mbuf *m, int off, int proto, void *arg) { struct stf_softc *sc = arg; struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; struct ifnet *ifp; + NET_EPOCH_ASSERT(); + if (proto != IPPROTO_IPV6) { m_freem(m); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index ff6c3291d7f4..eb197207ca46 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,1871 +1,1874 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. */ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user open() */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev, struct tuntap_driver *); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); static int tun_clone_destroy(struct if_clone *, struct ifnet *); static d_open_t tunopen; static d_close_t tunclose; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_t *clone_match_fn; ifc_create_t *clone_create_fn; ifc_destroy_t *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. */ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, "%s%d", drv->cdevsw.d_name, unit); } tuncreate(dev, drv); return (0); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } /* No preexisting struct cdev *, create one */ *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, UID_UUCP, GID_DIALER, 0600, "%s", name); } if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. */ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, drv->clone_match_fn, drv->clone_create_fn, drv->clone_destroy_fn); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); MODULE_VERSION(if_tun, 1); MODULE_VERSION(if_tap, 1); static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. */ static void tuncreate(struct cdev *dev, struct tuntap_driver *drv) { struct tuntap_softc *sc; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); sc->tun_flags = drv->ident_flags; sc->tun_dev = dev; sc->tun_drv = drv; mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); mtx_unlock(&tunmtx); iflags = IFF_MULTICAST; if ((sc->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = sc->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = sc; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if ((sc->tun_flags & TUN_L2) != 0) { ifp->if_mtu = ETHERMTU; ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } dev->si_drv1 = sc; TUN_LOCK(sc); sc->tun_flags |= TUN_INITED; TUN_UNLOCK(sc); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. */ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_driver *drv; struct tuntap_softc *tp; int error, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } if ((tunflags & TUN_L2) != 0) { /* Restrict? */ if (tap_allow_uopen == 0) { error = priv_check(td, PRIV_NET_TAP); if (error != 0) { CURVNET_RESTORE(); return (error); } } } /* * XXXRW: Non-atomic test and set of dev->si_drv1 requires * synchronization. */ tp = dev->si_drv1; if (!tp) { drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) { CURVNET_RESTORE(); return (ENXIO); } tuncreate(dev, drv); tp = dev->si_drv1; } TUN_LOCK(tp); if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); CURVNET_RESTORE(); return (0); } /* * tunclose - close the device - mark i/f down & delete * routing info */ static int tunclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; p = td->td_proc; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); } else { rtinit(ifa, (int)RTM_DELETE, 0); } } if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; tun_unbusy_locked(tp); TUN_UNLOCK(tp); return (0); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; #ifdef INET struct ifaddr *ifa; #endif TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); #ifdef INET if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *si; si = (struct sockaddr_in *)ifa->ifa_addr; if (si->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } } if_addr_runlock(ifp); #endif TUN_UNLOCK(tp); } else { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct tuninfo *tunp; int error, iflags; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; #endif bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); TUN2IFP(tp)->if_flags = iflags | (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; int error=0, len; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); if ((tp->tun_flags & TUN_L2) != 0) BPF_MTAP(ifp, m); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) { struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { + struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? */ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); + NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; align = 0; mru = l2tun ? TAPMRU : TUNMRU; if (l2tun) align = ETHER_ALIGN; else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 76a0f29d63c3..2f6aa69ca000 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1,2090 +1,2061 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_kern_tls.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; #if defined(KERN_TLS) || defined(RATELIMIT) struct vlan_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; static inline struct vlan_snd_tag * mst_to_vst(struct m_snd_tag *mst) { return (__containerof(mst, struct vlan_snd_tag, com)); } #endif /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. */ #ifdef VLAN_ARRAY #define VLAN_FOREACH(_ifv, _trunk) \ size_t _i; \ for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i]) != NULL) #else /* VLAN_ARRAY */ #define VLAN_FOREACH(_ifv, _trunk) \ struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* * This macro provides a facility to iterate over every vlan on a trunk while * also modifying the number of vlans on the trunk. The iteration continues * until some condition is met or there are no more vlans on the trunk. */ #ifdef VLAN_ARRAY /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i])) #else /* VLAN_ARRAY */ /* * The hash table case is more complicated. We allow for the hash table to be * modified (i.e. vlans removed) while we are iterating over it. To allow for * this we must restart the iteration every time we "touch" something during * the iteration, since removal will resize the hash table and invalidate our * current position. If acting on the touched element causes the trunk to be * emptied, then iteration also stops. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ bool _touch = false; \ for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; struct epoch_context mc_epoch_ctx; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ uint16_t ifv_proto; /* encapsulation ethertype */ uint16_t ifv_tag; /* tag to apply on packets leaving if */ uint16_t ifv_vid; /* VLAN ID */ uint8_t ifv_pcp; /* Priority Code Point (PCP). */ struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; extern int vlan_mtag_pcp; static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* * if_vlan uses two module-level synchronizations primitives to allow concurrent * modification of vlan interfaces and (mostly) allow for vlans to be destroyed * while they are being used for tx/rx. To accomplish this in a way that has * acceptable performance and cooperation with other parts of the network stack * there is a non-sleepable epoch(9) and an sx(9). * * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init(&_VLAN_SX_ID, "vlan_sx") #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) -#define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. */ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int vlan_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int vlan_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif static void vlan_mc_free(struct epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. */ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_LOCK_DESTROY(trunk); if_rele(trunk->parent); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) return (error); } CURVNET_RESTORE(); return (0); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; /* Need the epoch since this is run on taskqueue_swi. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR * ioctl calls on the parent garbling the lladdr of the child vlan. */ TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { /* * Copy new new lladdr into the ifv_ifp, enqueue a task * to actually call if_setlladdr. if_setlladdr needs to * be deferred to a taskqueue because it will call into * the if_vlan ioctl path and try to acquire the global * lock. */ ifv_ifp = ifv->ifv_ifp; bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp), ifp->if_addrlen); sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr; sdl->sdl_alen = ifp->if_addrlen; taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; struct ifvlantrunk *trunk; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_XLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_XUNLOCK(); return; } /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk, ifp->if_vlantrunk == NULL) vlan_unconfig_locked(ifv->ifv_ifp, 1); /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_XUNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { - struct epoch_tracker et; struct ifvlan *ifv; + NET_EPOCH_ASSERT(); + if (ifp->if_type != IFT_L2VLAN) return (NULL); - NET_EPOCH_ENTER(et); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); - NET_EPOCH_EXIT(et); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { - struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + trunk = ifp->if_vlantrunk; - if (trunk == NULL) { - NET_EPOCH_EXIT(et); + if (trunk == NULL) return (NULL); - } ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; - NET_EPOCH_EXIT(et); return (ifp); } /* * Recalculate the cached VLAN tag exposed via the MIB. */ static void vlan_tag_recalculate(struct ifvlan *ifv) { ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { if_clone_detach(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_vlan_uninit, NULL); #endif /* * Check for . style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. */ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { const char *cp; if (vlan_clone_match_ethervid(name, NULL) != NULL) return (1); if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int error; int vid; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are 3 (ugh) ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are * supported for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); if (error) return error; p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); error = ifc_name2unit(name, &unit); if (error != 0) { if_rele(p); return (error); } vid = vlr.vlr_tag; wildcard = (unit < 0); } else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) { unit = -1; wildcard = 0; } else { p = NULL; error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); } error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; ifp->if_snd_tag_modify = vlan_snd_tag_modify; ifp->if_snd_tag_query = vlan_snd_tag_query; ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } return (0); } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { - struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct vlan_snd_tag *vst; struct m_snd_tag *mst; MPASS(m->m_pkthdr.snd_tag->ifp == ifp); mst = m->m_pkthdr.snd_tag; vst = mst_to_vst(mst); if (vst->tag->ifp != p) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - NET_EPOCH_EXIT(et); m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); m_snd_tag_rele(mst); } #endif /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - NET_EPOCH_EXIT(et); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - NET_EPOCH_EXIT(et); return (error); } static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { - struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *p; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { - NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); - NET_EPOCH_EXIT(et); return p->if_output(ifp, m, dst, ro); } /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { - struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + trunk = ifp->if_vlantrunk; if (trunk == NULL) { - NET_EPOCH_EXIT(et); m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); - NET_EPOCH_EXIT(et); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); - NET_EPOCH_EXIT(et); m_freem(m); return; } } vid = EVL_VLANOFTAG(tag); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { - NET_EPOCH_EXIT(et); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } if (vlan_mtag_pcp) { /* * While uncommon, it is possible that we will find a 802.1q * packet encapsulated inside another packet that also had an * 802.1q header. For example, ethernet tunneled over IPSEC * arriving over ethernet. In that case, we replace the * existing 802.1q PCP m_tag value. */ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - NET_EPOCH_EXIT(et); m_freem(m); return; } m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); } m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); - NET_EPOCH_EXIT(et); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); } static void vlan_lladdr_fn(void *arg, int pending __unused) { struct ifvlan *ifv; struct ifnet *ifp; ifv = (struct ifvlan *)arg; ifp = ifv->ifv_ifp; CURVNET_SET(ifp->if_vnet); /* The ifv_ifp already has the lladdr copied in. */ if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen); CURVNET_RESTORE(); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { - struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; + NET_EPOCH_ASSERT(); + /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (p->if_type != IFT_ETHER && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. * VID numbers 0x0 and 0xFFF are reserved. */ if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) return (EINVAL); if (ifv->ifv_trunk) return (EBUSY); VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); TRUNK_WUNLOCK(trunk); } else { trunk = p->if_vlantrunk; } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ ifv->ifv_pcp = 0; /* Default: best effort delivery. */ vlan_tag_recalculate(ifv); error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = ETHERTYPE_VLAN; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. */ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; ifp->if_pcp = ifv->ifv_pcp; /* * We wrap the parent's if_output using vlan_output to ensure that it * can't become stale. */ ifp->if_output = vlan_output; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; - NET_EPOCH_ENTER(et); vlan_capabilities(ifv); - NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying * physical interface's. */ bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_XLOCK(); vlan_unconfig_locked(ifp, 0); VLAN_XUNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_XLOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. */ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; NET_EPOCH_WAIT(); trunk_destroy(trunk); } } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; VLAN_SXLOCK_ASSERT(); ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { - struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; - /* Called from a taskqueue_swi task, so we cannot sleep. */ - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + trunk = ifp->if_vlantrunk; - if (trunk == NULL) { - NET_EPOCH_EXIT(et); + if (trunk == NULL) return; - } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); - NET_EPOCH_EXIT(et); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; int cap = 0, ena = 0, mena; u_long hwa = 0; - VLAN_SXLOCK_ASSERT(); NET_EPOCH_ASSERT(); + VLAN_SXLOCK_ASSERT(); + p = PARENT(ifv); ifp = ifv->ifv_ifp; /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (ena & IFCAP_TXCSUM) hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); if (ena & IFCAP_TXCSUM_IPV6) hwa |= p->if_hwassist & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ memset(&hw_tsomax, 0, sizeof(hw_tsomax)); if_hw_tsomax_common(p, &hw_tsomax); if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) cap |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ena |= mena & IFCAP_TSO; if (ena & IFCAP_TSO) hwa |= p->if_hwassist & CSUM_TSO; } /* * If the parent interface can do LRO and checksum offloading on * VLANs, then guess it may do LRO on VLANs. False positive here * cost nothing, while false negative may lead to some confusions. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & IFCAP_LRO; if (p->if_capenable & IFCAP_VLAN_HWCSUM) ena |= p->if_capenable & IFCAP_LRO; /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) cap |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { TOEDEV(ifp) = TOEDEV(p); ena |= mena & IFCAP_TOE; } /* * If the parent interface supports dynamic link state, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_LINKSTATE); ena |= (mena & IFCAP_LINKSTATE); #ifdef RATELIMIT /* * If the parent interface supports ratelimiting, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_TXRTLMT); ena |= (mena & IFCAP_TXRTLMT); #endif /* * If the parent interface supports unmapped mbufs, so does * the VLAN interface. Note that this should be fine even for * interfaces that don't support hardware tagging as headers * are prepended in normal mbufs to unmapped mbufs holding * payload data. */ cap |= (p->if_capabilities & IFCAP_NOMAP); ena |= (mena & IFCAP_NOMAP); /* * If the parent interface can offload encryption and segmentation * of TLS records over TCP, propagate it's capability to the VLAN * interface. * * All TLS drivers in the tree today can deal with VLANs. If * this ever changes, then a new IFCAP_VLAN_TXTLS can be * defined. */ if (p->if_capabilities & IFCAP_TXTLS) cap |= p->if_capabilities & IFCAP_TXTLS; if (p->if_capenable & IFCAP_TXTLS) ena |= mena & IFCAP_TXTLS; ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; } static void vlan_trunk_capabilities(struct ifnet *ifp) { - struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; VLAN_SLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_SUNLOCK(); return; } - NET_EPOCH_ENTER(et); - VLAN_FOREACH(ifv, trunk) { + VLAN_FOREACH(ifv, trunk) vlan_capabilities(ifv); - } - NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; + NET_EPOCH_ASSERT(); + ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ifp->if_addrlen); break; case SIOCGIFMEDIA: VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); if_ref(p); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); if_rele(p); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { error = EINVAL; } VLAN_SUNLOCK(); break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_SLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_WLOCK(trunk); if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; TRUNK_WUNLOCK(trunk); } else error = EINVAL; VLAN_SUNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit_ref(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } error = vlan_config(ifv, p, vlr.vlr_tag); if_rele(p); break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; } VLAN_SUNLOCK(); error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ VLAN_XLOCK(); if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); VLAN_XUNLOCK(); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. * * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ VLAN_XLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) error = vlan_setmulti(ifp); VLAN_XUNLOCK(); break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif ifr->ifr_vlan_pcp = ifv->ifv_pcp; break; case SIOCSVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = priv_check(curthread, PRIV_NET_SETVLANPCP); if (error) break; if (ifr->ifr_vlan_pcp > 7) { error = EINVAL; break; } ifv->ifv_pcp = ifr->ifr_vlan_pcp; ifp->if_pcp = ifv->ifv_pcp; vlan_tag_recalculate(ifv); /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); break; case SIOCSIFCAP: VLAN_SLOCK(); ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); - if (trunk != NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); + if (trunk != NULL) vlan_capabilities(ifv); - NET_EPOCH_EXIT(et); - } VLAN_SUNLOCK(); break; default: error = EINVAL; break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct epoch_tracker et; struct vlan_snd_tag *vst; struct ifvlan *ifv; struct ifnet *parent; int error; NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (ifv->ifv_trunk != NULL) parent = PARENT(ifv); else parent = NULL; if (parent == NULL || parent->if_snd_tag_alloc == NULL) { NET_EPOCH_EXIT(et); return (EOPNOTSUPP); } if_ref(parent); NET_EPOCH_EXIT(et); vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); if (vst == NULL) { if_rele(parent); return (ENOMEM); } error = parent->if_snd_tag_alloc(parent, params, &vst->tag); if_rele(parent); if (error) { free(vst, M_VLAN); return (error); } m_snd_tag_init(&vst->com, ifp); *ppmt = &vst->com; return (0); } static int vlan_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); } static int vlan_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); } static void vlan_snd_tag_free(struct m_snd_tag *mst) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); m_snd_tag_rele(vst->tag); free(vst, M_VLAN); } #endif diff --git a/sys/net/netisr.c b/sys/net/netisr.c index 049fe0fc32d8..ca269815d2fe 100644 --- a/sys/net/netisr.c +++ b/sys/net/netisr.c @@ -1,1534 +1,1538 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * netisr is a packet dispatch service, allowing synchronous (directly * dispatched) and asynchronous (deferred dispatch) processing of packets by * registered protocol handlers. Callers pass a protocol identifier and * packet to netisr, along with a direct dispatch hint, and work will either * be immediately processed by the registered handler, or passed to a * software interrupt (SWI) thread for deferred dispatch. Callers will * generally select one or the other based on: * * - Whether directly dispatching a netisr handler lead to code reentrance or * lock recursion, such as entering the socket code from the socket code. * - Whether directly dispatching a netisr handler lead to recursive * processing, such as when decapsulating several wrapped layers of tunnel * information (IPSEC within IPSEC within ...). * * Maintaining ordering for protocol streams is a critical design concern. * Enforcing ordering limits the opportunity for concurrency, but maintains * the strong ordering requirements found in some protocols, such as TCP. Of * related concern is CPU affinity--it is desirable to process all data * associated with a particular stream on the same CPU over time in order to * avoid acquiring locks associated with the connection on different CPUs, * keep connection data in one cache, and to generally encourage associated * user threads to live on the same CPU as the stream. It's also desirable * to avoid lock migration and contention where locks are associated with * more than one flow. * * netisr supports several policy variations, represented by the * NETISR_POLICY_* constants, allowing protocols to play various roles in * identifying flows, assigning work to CPUs, etc. These are described in * netisr.h. */ #include "opt_ddb.h" #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define _WANT_NETISR_INTERNAL /* Enable definitions from netisr_internal.h */ #include #include #include #include #include /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to * prevent partially registered or unregistered protocols from being run. * * The following data structures and fields are protected by this lock: * * - The netisr_proto array, including all fields of struct netisr_proto. * - The nws array, including all fields of struct netisr_worker. * - The nws_array array. * * Note: the NETISR_LOCKING define controls whether read locks are acquired * in packet processing paths requiring netisr registration stability. This * is disabled by default as it can lead to measurable performance * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and * because netisr registration and unregistration is extremely rare at * runtime. If it becomes more common, this decision should be revisited. * * XXXRW: rmlocks don't support assertions. */ static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ RM_NOWITNESS) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) /* #define NETISR_LOCKING */ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- * Three global direct dispatch policies are supported: * * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of * context (may be overriden by protocols). * * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, * and we're running on the CPU the work would be performed on, then direct * dispatch it if it wouldn't violate ordering constraints on the workstream. * * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch, * always direct dispatch. (The default.) * * Notice that changing the global policy could lead to short periods of * misordered processing, but this is considered acceptable as compared to * the complexity of enforcing ordering during policy changes. Protocols can * override the global policy (when they're not doing that, they select * NETISR_DISPATCH_DEFAULT). */ #define NETISR_DISPATCH_POLICY_DEFAULT NETISR_DISPATCH_DIRECT #define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, 0, 0, sysctl_netisr_dispatch_policy, "A", "netisr dispatch policy"); /* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for * CPU 0. This must be set at boot. We will create at most one thread per CPU. * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and * therefore only 1 workstream. If set to -1, netisr would use all cpus * (mp_ncpus) and therefore would have those many workstreams. One workstream * per thread (CPU). */ static int netisr_maxthreads = 1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, * both for initial configuration and later modification using * netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); /* * The default per-workstream mbuf queue limit for protocols that don't * initialize the nh_qlimit field of their struct netisr_handler. If this is * set above netisr_maxqlimit, we truncate it to the maximum during boot. */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); /* * Store and export the compile-time constant NETISR_MAXPROT limit on the * number of protocols that can register with netisr at a time. This is * required for crashdump analysis, as it sizes netisr_proto[]. */ static u_int netisr_maxprot = NETISR_MAXPROT; SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, &netisr_maxprot, 0, "Compile-time limit on the number of protocols supported by netisr."); /* * The netisr_proto array describes all registered protocols, indexed by * protocol number. See netisr_internal.h for more details. */ static struct netisr_proto netisr_proto[NETISR_MAXPROT]; #ifdef VIMAGE /* * The netisr_enable array describes a per-VNET flag for registered * protocols on whether this netisr is active in this VNET or not. * netisr_register() will automatically enable the netisr for the * default VNET and all currently active instances. * netisr_unregister() will disable all active VNETs, including vnet0. * Individual network stack instances can be enabled/disabled by the * netisr_(un)register _vnet() functions. * With this we keep the one netisr_proto per protocol but add a * mechanism to stop netisr processing for vnet teardown. * Apart from that we expect a VNET to always be enabled. */ VNET_DEFINE_STATIC(u_int, netisr_enable[NETISR_MAXPROT]); #define V_netisr_enable VNET(netisr_enable) #endif /* * Per-CPU workstream data. See netisr_internal.h for more details. */ DPCPU_DEFINE(struct netisr_workstream, nws); /* * Map contiguous values between 0 and nws_count into CPU IDs appropriate for * accessing workstreams. This allows constructions of the form * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws). */ static u_int nws_array[MAXCPU]; /* * Number of registered workstreams. Will be at most the number of running * CPUs once fully started. */ static u_int nws_count; SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields * in each stream, including per-protocol state (mbuf queues). The SWI is * woken up if asynchronous dispatch is required. */ #define NWS_LOCK(s) mtx_lock(&(s)->nws_mtx) #define NWS_LOCK_ASSERT(s) mtx_assert(&(s)->nws_mtx, MA_OWNED) #define NWS_UNLOCK(s) mtx_unlock(&(s)->nws_mtx) #define NWS_SIGNAL(s) swi_sched((s)->nws_swi_cookie, 0) /* * Utility routines for protocols that implement their own mapping of flows * to CPUs. */ u_int netisr_get_cpucount(void) { return (nws_count); } u_int netisr_get_cpuid(u_int cpunumber) { return (nws_array[cpunumber % nws_count]); } /* * The default implementation of flow -> CPU ID mapping. * * Non-static so that protocols can use it to map their own work to specific * CPUs in a manner consistent to netisr for affinity purposes. */ u_int netisr_default_flow2cpu(u_int flowid) { return (nws_array[flowid % nws_count]); } /* * Dispatch tunable and sysctl configuration. */ struct netisr_dispatch_table_entry { u_int ndte_policy; const char *ndte_policy_str; }; static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { { NETISR_DISPATCH_DEFAULT, "default" }, { NETISR_DISPATCH_DEFERRED, "deferred" }, { NETISR_DISPATCH_HYBRID, "hybrid" }, { NETISR_DISPATCH_DIRECT, "direct" }, }; static void netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, u_int buflen) { const struct netisr_dispatch_table_entry *ndtep; const char *str; u_int i; str = "unknown"; for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (ndtep->ndte_policy == dispatch_policy) { str = ndtep->ndte_policy_str; break; } } snprintf(buffer, buflen, "%s", str); } static int netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) { const struct netisr_dispatch_table_entry *ndtep; u_int i; for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (strcmp(ndtep->ndte_policy_str, str) == 0) { *dispatch_policyp = ndtep->ndte_policy; return (0); } } return (EINVAL); } static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) { char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; u_int dispatch_policy; int error; netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp, sizeof(tmp)); error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req); if (error == 0 && req->newptr != NULL) { error = netisr_dispatch_policy_from_str(tmp, &dispatch_policy); if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) error = EINVAL; if (error == 0) netisr_dispatch_policy = dispatch_policy; } return (error); } /* * Register a new netisr handler, which requires initializing per-protocol * fields for each workstream. All netisr work is briefly suspended while * the protocol is installed. */ void netisr_register(const struct netisr_handler *nhp) { VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; const char *name; u_int i, proto; proto = nhp->nh_proto; name = nhp->nh_name; /* * Test that the requested registration is valid. */ KASSERT(nhp->nh_name != NULL, ("%s: nh_name NULL for %u", __func__, proto)); KASSERT(nhp->nh_handler != NULL, ("%s: nh_handler NULL for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE || nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_policy == NETISR_POLICY_CPU, ("%s: unsupported nh_policy %u for %s", __func__, nhp->nh_policy, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_m2flow == NULL, ("%s: nh_policy != FLOW but m2flow defined for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL, ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__, name)); KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, name)); KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT || nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED || nhp->nh_dispatch == NETISR_DISPATCH_HYBRID || nhp->nh_dispatch == NETISR_DISPATCH_DIRECT, ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u, %s): protocol too big", __func__, proto, name)); /* * Test that no existing registration exists for this protocol. */ NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_name == NULL, ("%s(%u, %s): name present", __func__, proto, name)); KASSERT(netisr_proto[proto].np_handler == NULL, ("%s(%u, %s): handler present", __func__, proto, name)); netisr_proto[proto].np_name = name; netisr_proto[proto].np_handler = nhp->nh_handler; netisr_proto[proto].np_m2flow = nhp->nh_m2flow; netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; if (nhp->nh_qlimit == 0) netisr_proto[proto].np_qlimit = netisr_defaultqlimit; else if (nhp->nh_qlimit > netisr_maxqlimit) { printf("%s: %s requested queue limit %u capped to " "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, netisr_maxqlimit); netisr_proto[proto].np_qlimit = netisr_maxqlimit; } else netisr_proto[proto].np_qlimit = nhp->nh_qlimit; netisr_proto[proto].np_policy = nhp->nh_policy; netisr_proto[proto].np_dispatch = nhp->nh_dispatch; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; bzero(npwp, sizeof(*npwp)); npwp->nw_qlimit = netisr_proto[proto].np_qlimit; } #ifdef VIMAGE /* * Test that we are in vnet0 and have a curvnet set. */ KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p", __func__, curvnet, vnet0)); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_netisr_enable[proto] = 1; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); #endif NETISR_WUNLOCK(); } /* * Clear drop counters across all workstreams for a protocol. */ void netisr_clearqdrops(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qdrops = 0; } NETISR_WUNLOCK(); } /* * Query current drop counters across all workstreams for a protocol. */ void netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int i, proto; *qdropp = 0; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; *qdropp += npwp->nw_qdrops; } NETISR_RUNLOCK(&tracker); } /* * Query current per-workstream queue limit for a protocol. */ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); *qlimitp = netisr_proto[proto].np_qlimit; NETISR_RUNLOCK(&tracker); } /* * Update the queue limit across per-workstream queues for a protocol. We * simply change the limits, and don't drain overflowed packets as they will * (hopefully) take care of themselves shortly. */ int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; if (qlimit > netisr_maxqlimit) return (EINVAL); proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); netisr_proto[proto].np_qlimit = qlimit; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qlimit = qlimit; } NETISR_WUNLOCK(); return (0); } /* * Drain all packets currently held in a particular protocol work queue. */ static void netisr_drain_proto(struct netisr_work *npwp) { struct mbuf *m; /* * We would assert the lock on the workstream but it's not passed in. */ while ((m = npwp->nw_head) != NULL) { npwp->nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (npwp->nw_head == NULL) npwp->nw_tail = NULL; npwp->nw_len--; m_freem(m); } KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); } /* * Remove the registration of a network protocol, which requires clearing * per-protocol fields across all workstreams, including freeing all mbufs in * the queues at time of unregister. All work in netisr is briefly suspended * while this takes place. */ void netisr_unregister(const struct netisr_handler *nhp) { VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); #ifdef VIMAGE VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_netisr_enable[proto] = 0; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); #endif netisr_proto[proto].np_name = NULL; netisr_proto[proto].np_handler = NULL; netisr_proto[proto].np_m2flow = NULL; netisr_proto[proto].np_m2cpuid = NULL; netisr_proto[proto].np_qlimit = 0; netisr_proto[proto].np_policy = 0; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; netisr_drain_proto(npwp); bzero(npwp, sizeof(*npwp)); } NETISR_WUNLOCK(); } #ifdef VIMAGE void netisr_register_vnet(const struct netisr_handler *nhp) { u_int proto; proto = nhp->nh_proto; KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, nhp->nh_name)); V_netisr_enable[proto] = 1; NETISR_WUNLOCK(); } static void netisr_drain_proto_vnet(struct vnet *vnet, u_int proto) { struct netisr_workstream *nwsp; struct netisr_work *npwp; struct mbuf *m, *mp, *n, *ne; u_int i; KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__)); NETISR_LOCK_ASSERT(); CPU_FOREACH(i) { nwsp = DPCPU_ID_PTR(i, nws); if (nwsp->nws_intr_event == NULL) continue; npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); /* * Rather than dissecting and removing mbufs from the middle * of the chain, we build a new chain if the packet stays and * update the head and tail pointers at the end. All packets * matching the given vnet are freed. */ m = npwp->nw_head; n = ne = NULL; while (m != NULL) { mp = m; m = m->m_nextpkt; mp->m_nextpkt = NULL; if (mp->m_pkthdr.rcvif->if_vnet != vnet) { if (n == NULL) { n = ne = mp; } else { ne->m_nextpkt = mp; ne = mp; } continue; } /* This is a packet in the selected vnet. Free it. */ npwp->nw_len--; m_freem(mp); } npwp->nw_head = n; npwp->nw_tail = ne; NWS_UNLOCK(nwsp); } } void netisr_unregister_vnet(const struct netisr_handler *nhp) { u_int proto; proto = nhp->nh_proto; KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, nhp->nh_name)); V_netisr_enable[proto] = 0; netisr_drain_proto_vnet(curvnet, proto); NETISR_WUNLOCK(); } #endif /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. */ static u_int netisr_get_dispatch(struct netisr_proto *npp) { /* * Protocol-specific configuration overrides the global default. */ if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT) return (npp->np_dispatch); return (netisr_dispatch_policy); } /* * Look up the workstream given a packet and source identifier. Do this by * checking the protocol's policy, and optionally call out to the protocol * for assistance if required. */ static struct mbuf * netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, uintptr_t source, struct mbuf *m, u_int *cpuidp) { struct ifnet *ifp; u_int policy; NETISR_LOCK_ASSERT(); /* * In the event we have only one worker, shortcut and deliver to it * without further ado. */ if (nws_count == 1) { *cpuidp = nws_array[0]; return (m); } /* * What happens next depends on the policy selected by the protocol. * If we want to support per-interface policies, we should do that * here first. */ policy = npp->np_policy; if (policy == NETISR_POLICY_CPU) { m = npp->np_m2cpuid(m, source, cpuidp); if (m == NULL) return (NULL); /* * It's possible for a protocol not to have a good idea about * where to process a packet, in which case we fall back on * the netisr code to decide. In the hybrid case, return the * current CPU ID, which will force an immediate direct * dispatch. In the queued case, fall back on the SOURCE * policy. */ if (*cpuidp != NETISR_CPUID_NONE) { *cpuidp = netisr_get_cpuid(*cpuidp); return (m); } if (dispatch_policy == NETISR_DISPATCH_HYBRID) { *cpuidp = netisr_get_cpuid(curcpu); return (m); } policy = NETISR_POLICY_SOURCE; } if (policy == NETISR_POLICY_FLOW) { if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE && npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) return (NULL); } if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { *cpuidp = netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); } policy = NETISR_POLICY_SOURCE; } KASSERT(policy == NETISR_POLICY_SOURCE, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; else *cpuidp = nws_array[source % nws_count]; return (m); } /* * Process packets associated with a workstream and protocol. For reasons of * fairness, we process up to one complete netisr queue at a time, moving the * queue to a stack-local queue for processing, but do not loop refreshing * from the global queue. The caller is responsible for deciding whether to * loop, and for setting the NWS_RUNNING flag. The passed workstream will be * locked on entry and relocked before return, but will be released while * processing. The number of packets processed is returned. */ static u_int netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) { struct netisr_work local_npw, *npwp; + struct epoch_tracker et; u_int handled; struct mbuf *m; NETISR_LOCK_ASSERT(); NWS_LOCK_ASSERT(nwsp); KASSERT(nwsp->nws_flags & NWS_RUNNING, ("%s(%u): not running", __func__, proto)); KASSERT(proto >= 0 && proto < NETISR_MAXPROT, ("%s(%u): invalid proto\n", __func__, proto)); npwp = &nwsp->nws_work[proto]; if (npwp->nw_len == 0) return (0); /* * Move the global work queue to a thread-local work queue. * * Notice that this means the effective maximum length of the queue * is actually twice that of the maximum queue length specified in * the protocol registration call. */ handled = npwp->nw_len; local_npw = *npwp; npwp->nw_head = NULL; npwp->nw_tail = NULL; npwp->nw_len = 0; nwsp->nws_pendingbits &= ~(1 << proto); NWS_UNLOCK(nwsp); + NET_EPOCH_ENTER(et); while ((m = local_npw.nw_head) != NULL) { local_npw.nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (local_npw.nw_head == NULL) local_npw.nw_tail = NULL; local_npw.nw_len--; VNET_ASSERT(m->m_pkthdr.rcvif != NULL, ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m)); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); netisr_proto[proto].np_handler(m); CURVNET_RESTORE(); } + NET_EPOCH_EXIT(et); KASSERT(local_npw.nw_len == 0, ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); if (netisr_proto[proto].np_drainedcpu) netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); NWS_LOCK(nwsp); npwp->nw_handled += handled; return (handled); } /* * SWI handler for netisr -- processes packets in a set of workstreams that * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already * being direct dispatched, go back to sleep and wait for the dispatching * thread to wake us up again. */ static void swi_net(void *arg) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; u_int bits, prot; nwsp = arg; #ifdef DEVICE_POLLING KASSERT(nws_count == 1, ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) goto out; nwsp->nws_flags |= NWS_RUNNING; nwsp->nws_flags &= ~NWS_SCHEDULED; while ((bits = nwsp->nws_pendingbits) != 0) { while ((prot = ffs(bits)) != 0) { prot--; bits &= ~(1 << prot); (void)netisr_process_workstream_proto(nwsp, prot); } } nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif } static int netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, struct netisr_work *npwp, struct mbuf *m, int *dosignalp) { NWS_LOCK_ASSERT(nwsp); *dosignalp = 0; if (npwp->nw_len < npwp->nw_qlimit) { m->m_nextpkt = NULL; if (npwp->nw_head == NULL) { npwp->nw_head = m; npwp->nw_tail = m; } else { npwp->nw_tail->m_nextpkt = m; npwp->nw_tail = m; } npwp->nw_len++; if (npwp->nw_len > npwp->nw_watermark) npwp->nw_watermark = npwp->nw_len; /* * We must set the bit regardless of NWS_RUNNING, so that * swi_net() keeps calling netisr_process_workstream_proto(). */ nwsp->nws_pendingbits |= (1 << proto); if (!(nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { nwsp->nws_flags |= NWS_SCHEDULED; *dosignalp = 1; /* Defer until unlocked. */ } npwp->nw_queued++; return (0); } else { m_freem(m); npwp->nw_qdrops++; return (ENOBUFS); } } static int netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) { struct netisr_workstream *nwsp; struct netisr_work *npwp; int dosignal, error; #ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); #endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); dosignal = 0; error = 0; nwsp = DPCPU_ID_PTR(cpuid, nws); npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); return (error); } int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif u_int cpuid; int error; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); #ifdef VIMAGE if (V_netisr_enable[proto] == 0) { m_freem(m); return (ENOPROTOOPT); } #endif m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); error = netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_queue(u_int proto, struct mbuf *m) { return (netisr_queue_src(proto, 0, m)); } /* * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. */ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *npwp; int dosignal, error; u_int cpuid, dispatch_policy; + NET_EPOCH_ASSERT(); KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif npp = &netisr_proto[proto]; KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); #ifdef VIMAGE if (V_netisr_enable[proto] == 0) { m_freem(m); return (ENOPROTOOPT); } #endif dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); /* * If direct dispatch is forced, then unconditionally dispatch * without a formal CPU selection. Borrow the current CPU's stats, * even if there's no worker on it. In this case we don't update * nws_flags because all netisr processing will be source ordered due * to always being forced to directly dispatch. */ if (dispatch_policy == NETISR_DISPATCH_DIRECT) { nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; npwp->nw_handled++; netisr_proto[proto].np_handler(m); error = 0; goto out_unlock; } KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID, ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy)); /* * Otherwise, we execute in a hybrid mode where we will try to direct * dispatch if we're on the right CPU and the netisr worker isn't * already running. */ sched_pin(); m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID, source, m, &cpuid); if (m == NULL) { error = ENOBUFS; goto out_unpin; } KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); if (cpuid != curcpu) goto queue_fallback; nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; /*- * We are willing to direct dispatch only if three conditions hold: * * (1) The netisr worker isn't already running, * (2) Another thread isn't already directly dispatching, and * (3) The netisr hasn't already been woken up. */ NWS_LOCK(nwsp); if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); goto out_unpin; } /* * The current thread is now effectively the netisr worker, so set * the dispatching flag to prevent concurrent processing of the * stream from another thread (even the netisr worker), which could * otherwise lead to effective misordering of the stream. */ nwsp->nws_flags |= NWS_DISPATCHING; NWS_UNLOCK(nwsp); netisr_proto[proto].np_handler(m); NWS_LOCK(nwsp); nwsp->nws_flags &= ~NWS_DISPATCHING; npwp->nw_handled++; npwp->nw_hybrid_dispatched++; /* * If other work was enqueued by another thread while we were direct * dispatching, we need to signal the netisr worker to do that work. * In the future, we might want to do some of that work in the * current thread, rather than trigger further context switches. If * so, we'll want to establish a reasonable bound on the work done in * the "borrowed" context. */ if (nwsp->nws_pendingbits != 0) { nwsp->nws_flags |= NWS_SCHEDULED; dosignal = 1; } else dosignal = 0; NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); error = 0; goto out_unpin; queue_fallback: error = netisr_queue_internal(proto, m, cpuid); out_unpin: sched_unpin(); out_unlock: #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_dispatch(u_int proto, struct mbuf *m) { return (netisr_dispatch_src(proto, 0, m)); } #ifdef DEVICE_POLLING /* * Kernel polling borrows a netisr thread to run interface polling in; this * function allows kernel polling to request that the netisr thread be * scheduled even if no packets are pending for protocols. */ void netisr_sched_poll(void) { struct netisr_workstream *nwsp; nwsp = DPCPU_ID_PTR(nws_array[0], nws); NWS_SIGNAL(nwsp); } #endif static void netisr_start_swi(u_int cpuid, struct pcpu *pc) { char swiname[12]; struct netisr_workstream *nwsp; int error; KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); nwsp = DPCPU_ID_PTR(cpuid, nws); mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); nwsp->nws_cpu = cpuid; snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); if (error) panic("%s: swi_add %d", __func__, error); pc->pc_netisr = nwsp->nws_intr_event; if (netisr_bindthreads) { error = intr_event_bind(nwsp->nws_intr_event, cpuid); if (error != 0) printf("%s: cpu %u: intr_event_bind: %d", __func__, cpuid, error); } NETISR_WLOCK(); nws_array[nws_count] = nwsp->nws_cpu; nws_count++; NETISR_WUNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. * * Start a worker thread for the boot CPU so that we can support network * traffic immediately in case the network stack is used before additional * CPUs are started (for example, diskless boot). */ static void netisr_init(void *arg) { struct pcpu *pc; NETISR_LOCK_INIT(); if (netisr_maxthreads == 0 || netisr_maxthreads < -1 ) netisr_maxthreads = 1; /* default behavior */ else if (netisr_maxthreads == -1) netisr_maxthreads = mp_ncpus; /* use max cpus */ if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with * multiple netisr threads, so for the time being compiling in device * polling disables parallel netisr workers. */ if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { printf("netisr_init: forcing maxthreads to 1 and " "bindthreads to 0 for device polling\n"); netisr_maxthreads = 1; netisr_bindthreads = 0; } #endif #ifdef EARLY_AP_STARTUP STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; netisr_start_swi(pc->pc_cpuid, pc); } #else pc = get_pcpu(); netisr_start_swi(pc->pc_cpuid, pc); #endif } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); #ifndef EARLY_AP_STARTUP /* * Start worker threads for additional CPUs. No attempt to gracefully handle * work reassignment, we don't yet support dynamic reconfiguration. */ static void netisr_start(void *arg) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; /* Worker will already be present for boot CPU. */ if (pc->pc_netisr != NULL) continue; netisr_start_swi(pc->pc_cpuid, pc); } } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); #endif /* * Sysctl monitoring for netisr: query a list of registered protocols. */ static int sysctl_netisr_proto(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_proto *snpp, *snp_array; struct netisr_proto *npp; u_int counter, proto; int error; if (req->newptr != NULL) return (EINVAL); snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; snpp = &snp_array[counter]; snpp->snp_version = sizeof(*snpp); strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN); snpp->snp_proto = proto; snpp->snp_qlimit = npp->np_qlimit; snpp->snp_policy = npp->np_policy; snpp->snp_dispatch = npp->np_dispatch; if (npp->np_m2flow != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; if (npp->np_m2cpuid != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID; if (npp->np_drainedcpu != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU; counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= NETISR_MAXPROT, ("sysctl_netisr_proto: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter); free(snp_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, proto, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto, "S,sysctl_netisr_proto", "Return list of protocols registered with netisr"); /* * Sysctl monitoring for netisr: query a list of workstreams. */ static int sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_workstream *snwsp, *snws_array; struct netisr_workstream *nwsp; u_int counter, cpuid; int error; if (req->newptr != NULL) return (EINVAL); snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); snwsp = &snws_array[counter]; snwsp->snws_version = sizeof(*snwsp); /* * For now, we equate workstream IDs and CPU IDs in the * kernel, but expose them independently to userspace in case * that assumption changes in the future. */ snwsp->snws_wsid = cpuid; snwsp->snws_cpu = cpuid; if (nwsp->nws_intr_event != NULL) snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR; NWS_UNLOCK(nwsp); counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= MAXCPU, ("sysctl_netisr_workstream: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter); free(snws_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, workstream, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream, "S,sysctl_netisr_workstream", "Return list of workstreams implemented by netisr"); /* * Sysctl monitoring for netisr: query per-protocol data across all * workstreams. */ static int sysctl_netisr_work(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_work *snwp, *snw_array; struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *nwp; u_int counter, cpuid, proto; int error; if (req->newptr != NULL) return (EINVAL); snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; nwp = &nwsp->nws_work[proto]; snwp = &snw_array[counter]; snwp->snw_version = sizeof(*snwp); snwp->snw_wsid = cpuid; /* See comment above. */ snwp->snw_proto = proto; snwp->snw_len = nwp->nw_len; snwp->snw_watermark = nwp->nw_watermark; snwp->snw_dispatched = nwp->nw_dispatched; snwp->snw_hybrid_dispatched = nwp->nw_hybrid_dispatched; snwp->snw_qdrops = nwp->nw_qdrops; snwp->snw_queued = nwp->nw_queued; snwp->snw_handled = nwp->nw_handled; counter++; } NWS_UNLOCK(nwsp); } KASSERT(counter <= MAXCPU * NETISR_MAXPROT, ("sysctl_netisr_work: counter too big (%d)", counter)); NETISR_RUNLOCK(&tracker); error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter); free(snw_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, work, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work, "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { struct netisr_workstream *nwsp; struct netisr_work *nwp; int first, proto; u_int cpuid; db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; first = 1; for (proto = 0; proto < NETISR_MAXPROT; proto++) { if (netisr_proto[proto].np_handler == NULL) continue; nwp = &nwsp->nws_work[proto]; if (first) { db_printf("%3d ", cpuid); first = 0; } else db_printf("%3s ", ""); db_printf( "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", netisr_proto[proto].np_name, nwp->nw_len, nwp->nw_watermark, nwp->nw_qlimit, nwp->nw_dispatched, nwp->nw_hybrid_dispatched, nwp->nw_qdrops, nwp->nw_queued); } } } #endif diff --git a/sys/net/route.c b/sys/net/route.c index 1dd12da6ded1..ee3550441cf9 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -1,2302 +1,2301 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) VNET_DEFINE_STATIC(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } u_int rt_tables_get_gen(int table, int fam) { struct rib_head *rnh; rnh = *rt_tables_get_rnh_ptr(table, fam); KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", __func__, table, fam)); return (rnh->rnh_gen); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ RIB_LOCK_INIT(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); return (newrt); } else if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); /* * Either we hit the root or could not find any match, * which basically means: "cannot get there from here". */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; - struct epoch_tracker et; struct ifaddr *ifa; struct rib_head *rnh; + NET_EPOCH_ASSERT(); + ifa = NULL; - NET_EPOCH_ENTER(et); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; ifa_ref(ifa); info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: - NET_EPOCH_EXIT(et); if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; MPASS(in_epoch(net_epoch_preempt)); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt; rt = rtalloc1_fib(gateway, 0, flags, fibnum); if (rt == NULL) goto out; /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) goto out; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } out: return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; ifa_ref(info->rti_ifa); if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * if @flags contains NHR_REF, refcouting is performed on rt_ifp. * All references can be released later by calling rib_free_info() * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes_af(struct ifnet *ifp, int af) { KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", __func__, af, AF_MAX)); rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); } void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, info->rti_filterdata) == 0) { /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. * * Assume basic consistency checks are executed by callers: * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct epoch_tracker et; int needref, error; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; needref = (info->rti_ifa == NULL); NET_EPOCH_ENTER(et); /* If we have interface specified by the ifindex in the address, use it */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr; if (sdl->sdl_index != 0) info->rti_ifp = ifnet_byindex_locked(sdl->sdl_index); } /* * If we have source address specified, try to find it * TODO: avoid enumerating all ifas on all interfaces. */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; /* * Most common use case for the userland-supplied routes. * * Choose sockaddr to select ifa. * -- if ifp is set -- * Order of preference: * 1) IFA address * 2) gateway address * Note: for interface routes link-level gateway address * is specified to indicate the interface index without * specifying RTF_GATEWAY. In this case, ignore gateway * Note: gateway AF may be different from dst AF. In this case, * ignore gateway * 3) final destination. * 4) if all of these fails, try to get at least link-level ifa. * -- else -- * try to lookup gateway or dst in the routing table to get ifa */ if (info->rti_info[RTAX_IFA] != NULL) sa = info->rti_info[RTAX_IFA]; else if ((info->rti_flags & RTF_GATEWAY) != 0 && gateway->sa_family == dst->sa_family) sa = gateway; else sa = dst; if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); /* Case 4 */ if (info->rti_ifa == NULL && gateway != NULL) info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (needref && info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; ifa_ref(info->rti_ifa); } else error = ENETUNREACH; NET_EPOCH_EXIT(et); return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { if (dst->sa_len > sizeof(mdst)) return (EINVAL); rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ ifa = info->rti_ifa; ifa_ref(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { /* * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags * to avoid rlock in the ifa_ifwithroute(). */ info->rti_flags |= RTF_RNH_LOCKED; error = rt_getifa_fib(info, fibnum); info->rti_flags &= ~RTF_RNH_LOCKED; if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL) { if (rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); rt->rt_ifa = NULL; } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their * llentry cache. */ rnh->rnh_gen++; if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { RIB_RLOCK_TRACKER; struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; struct sockaddr_dl *sdl = NULL; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } else if (cmd == RTM_ADD) { sdl = (struct sockaddr_dl *)tempbuf; bzero(sdl, sizeof(struct sockaddr_dl)); sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_type = ifa->ifa_ifp->if_type; sdl->sdl_index = ifa->ifa_ifp->if_index; } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); ifa_ref(ifa); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 63dd1dedb481..cec807eb6155 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -1,2408 +1,2405 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; uint32_t _ifm_spare2; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #define SA_SIZE32(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(int) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) ) #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; VNET_DEFINE_STATIC(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void vnet_rts_init(void) { int tmp; if (IS_DEFAULT_VNET(curvnet)) { if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } #ifdef VIMAGE else netisr_register_vnet(&rtsock_nh); #endif } VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_init, 0); #ifdef VIMAGE static void vnet_rts_uninit(void) { netisr_unregister_vnet(&rtsock_nh); } VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { #if defined(INET) || defined(INET6) struct epoch_tracker et; #endif /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { RIB_RLOCK_TRACKER; struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rnh; struct rt_addrinfo info; struct sockaddr_storage ss; + struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); + NET_EPOCH_ENTER(et); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); if (rtm->rtm_flags & RTF_RNH_LOCKED) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info.rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); } saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide longest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct epoch_tracker et; struct ifaddr *ifa; NET_EPOCH_ENTER(et); ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); + NET_EPOCH_EXIT(et); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); - NET_EPOCH_EXIT(et); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RIB_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: + NET_EPOCH_EXIT(et); if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; int len, buflen = 0, dlen; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif #ifdef COMPAT_FREEBSD32 bool compat32 = false; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { len = sizeof(struct ifa_msghdrl32); compat32 = true; } else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); compat32 = true; break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); #ifdef COMPAT_FREEBSD32 if (compat32) dlen = SA_SIZE32(sa); else #endif dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else { bzero(cp, dlen); cp += dlen; buflen -= dlen; } } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; if (ifp && ifp->if_addr) info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; else info.rti_info[RTAX_IFP] = NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; NET_EPOCH_ASSERT(); if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp && !(rt->rt_ifp->if_flags & IFF_DYING)) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; bzero(&rtm->rtm_index, sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index)); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifm32->_ifm_spare2 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifm->_ifm_spare2 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct if_data ifd; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; - struct epoch_tracker et; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; if_data_copy(ifp, &ifd); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &ifd, &info, w, len); else error = sysctl_iflist_ifm(ifp, &ifd, &info, w, len); if (error) goto done; } while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: - NET_EPOCH_EXIT(et); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; - struct epoch_tracker et; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; int error, len; + NET_EPOCH_ASSERT(); + error = 0; bzero((caddr_t)&info, sizeof(info)); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) break; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; ifmam->_ifmam_spare1 = 0; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error != 0) break; } } if (error != 0) break; } - NET_EPOCH_EXIT(et); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { RIB_RLOCK_TRACKER; + struct epoch_tracker et; int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); + NET_EPOCH_ENTER(et); switch (w.w_op) { - case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { - struct epoch_tracker et; - RIB_RLOCK(rnh); - NET_EPOCH_ENTER(et); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); - NET_EPOCH_EXIT(et); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } + NET_EPOCH_EXIT(et); free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[nitems(routesw)] }; VNET_DOMAIN_SET(route); #ifdef DDB /* * Unfortunately, RTF_ values are expressed as raw masks rather than powers of * 2, so we cannot use them as nice C99 initializer indices below. */ static const char * const rtf_flag_strings[] = { "UP", "GATEWAY", "HOST", "REJECT", "DYNAMIC", "MODIFIED", "DONE", "UNUSED_0x80", "UNUSED_0x100", "XRESOLVE", "LLDATA", "STATIC", "BLACKHOLE", "UNUSED_0x2000", "PROTO2", "PROTO1", "UNUSED_0x10000", "UNUSED_0x20000", "PROTO3", "FIXEDMTU", "PINNED", "LOCAL", "BROADCAST", "MULTICAST", /* Big gap. */ [28] = "STICKY", [30] = "RNH_LOCKED", [31] = "GWFLAG_COMPAT", }; static const char * __pure rt_flag_name(unsigned idx) { if (idx >= nitems(rtf_flag_strings)) return ("INVALID_FLAG"); if (rtf_flag_strings[idx] == NULL) return ("UNKNOWN"); return (rtf_flag_strings[idx]); } static void rt_dumpaddr_ddb(const char *name, const struct sockaddr *sa) { char buf[INET6_ADDRSTRLEN], *res; res = NULL; if (sa == NULL) res = "NULL"; else if (sa->sa_family == AF_INET) { res = inet_ntop(AF_INET, &((const struct sockaddr_in *)sa)->sin_addr, buf, sizeof(buf)); } else if (sa->sa_family == AF_INET6) { res = inet_ntop(AF_INET6, &((const struct sockaddr_in6 *)sa)->sin6_addr, buf, sizeof(buf)); } else if (sa->sa_family == AF_LINK) { res = "on link"; } if (res != NULL) { db_printf("%s <%s> ", name, res); return; } db_printf("%s ", name, sa->sa_family); } static int rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused) { struct sockaddr_storage ss; struct rtentry *rt; int flags, idx; /* If RNTORT is important, put it in a header. */ rt = (void *)rn; rt_dumpaddr_ddb("dst", rt_key(rt)); rt_dumpaddr_ddb("gateway", rt->rt_gateway); rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss)); if (rt->rt_ifp != NULL && (rt->rt_ifp->if_flags & IFF_DYING) == 0) { rt_dumpaddr_ddb("ifp", rt->rt_ifp->if_addr->ifa_addr); rt_dumpaddr_ddb("ifa", rt->rt_ifa->ifa_addr); } db_printf("flags "); flags = rt->rt_flags; if (flags == 0) db_printf("none"); while ((idx = ffs(flags)) > 0) { idx--; if (flags != rt->rt_flags) db_printf(","); db_printf("%s", rt_flag_name(idx)); flags &= ~(1ul << idx); } db_printf("\n"); return (0); } DB_SHOW_COMMAND(routetable, db_show_routetable_cmd) { struct rib_head *rnh; int error, i, lim; if (have_addr) i = lim = addr; else { i = 1; lim = AF_MAX; } for (; i <= lim; i++) { rnh = rt_tables_get_rnh(0, i); if (rnh == NULL) { if (have_addr) { db_printf("%s: AF %d not supported?\n", __func__, i); break; } continue; } if (!have_addr && i > 1) db_printf("\n"); db_printf("Route table for AF %d%s%s%s:\n", i, (i == AF_INET || i == AF_INET6) ? " (" : "", (i == AF_INET) ? "INET" : (i == AF_INET6) ? "INET6" : "", (i == AF_INET || i == AF_INET6) ? ")" : ""); error = rnh->rnh_walktree(&rnh->head, rt_dumpentry_ddb, NULL); if (error != 0) db_printf("%s: walktree(%d): %d\n", __func__, i, error); } } _DB_FUNC(_show, route, db_show_route_cmd, db_show_table, CS_OWN, NULL) { char buf[INET6_ADDRSTRLEN], *bp; const void *dst_addrp; struct sockaddr *dstp; struct rtentry *rt; union { struct sockaddr_in dest_sin; struct sockaddr_in6 dest_sin6; } u; uint16_t hextets[8]; unsigned i, tets; int t, af, exp, tokflags; /* * Undecoded address family. No double-colon expansion seen yet. */ af = -1; exp = -1; /* Assume INET6 to start; we can work back if guess was wrong. */ tokflags = DRT_WSPACE | DRT_HEX | DRT_HEXADECIMAL; /* * db_command has lexed 'show route' for us. */ t = db_read_token_flags(tokflags); if (t == tWSPACE) t = db_read_token_flags(tokflags); /* * tEOL: Just 'show route' isn't a valid mode. * tMINUS: It's either '-h' or some invalid option. Regardless, usage. */ if (t == tEOL || t == tMINUS) goto usage; db_unread_token(t); tets = nitems(hextets); /* * Each loop iteration, we expect to read one octet (v4) or hextet * (v6), followed by an appropriate field separator ('.' or ':' or * '::'). * * At the start of each loop, we're looking for a number (octet or * hextet). * * INET6 addresses have a special case where they may begin with '::'. */ for (i = 0; i < tets; i++) { t = db_read_token_flags(tokflags); if (t == tCOLONCOLON) { /* INET6 with leading '::' or invalid. */ if (i != 0) { db_printf("Parse error: unexpected extra " "colons.\n"); goto exit; } af = AF_INET6; exp = i; hextets[i] = 0; continue; } else if (t == tNUMBER) { /* * Lexer separates out '-' as tMINUS, but make the * assumption explicit here. */ MPASS(db_tok_number >= 0); if (af == AF_INET && db_tok_number > UINT8_MAX) { db_printf("Not a valid v4 octet: %ld\n", (long)db_tok_number); goto exit; } hextets[i] = db_tok_number; } else if (t == tEOL) { /* * We can only detect the end of an IPv6 address in * compact representation with EOL. */ if (af != AF_INET6 || exp < 0) { db_printf("Parse failed. Got unexpected EOF " "when the address is not a compact-" "representation IPv6 address.\n"); goto exit; } break; } else { db_printf("Parse failed. Unexpected token %d.\n", t); goto exit; } /* Next, look for a separator, if appropriate. */ if (i == tets - 1) continue; t = db_read_token_flags(tokflags); if (af < 0) { if (t == tCOLON) { af = AF_INET6; continue; } if (t == tCOLONCOLON) { af = AF_INET6; i++; hextets[i] = 0; exp = i; continue; } if (t == tDOT) { unsigned hn, dn; af = AF_INET; /* Need to fixup the first parsed number. */ if (hextets[0] > 0x255 || (hextets[0] & 0xf0) > 0x90 || (hextets[0] & 0xf) > 9) { db_printf("Not a valid v4 octet: %x\n", hextets[0]); goto exit; } hn = hextets[0]; dn = (hn >> 8) * 100 + ((hn >> 4) & 0xf) * 10 + (hn & 0xf); hextets[0] = dn; /* Switch to decimal for remaining octets. */ tokflags &= ~DRT_RADIX_MASK; tokflags |= DRT_DECIMAL; tets = 4; continue; } db_printf("Parse error. Unexpected token %d.\n", t); goto exit; } else if (af == AF_INET) { if (t == tDOT) continue; db_printf("Expected '.' (%d) between octets but got " "(%d).\n", tDOT, t); goto exit; } else if (af == AF_INET6) { if (t == tCOLON) continue; if (t == tCOLONCOLON) { if (exp < 0) { i++; hextets[i] = 0; exp = i; continue; } db_printf("Got bogus second '::' in v6 " "address.\n"); goto exit; } if (t == tEOL) { /* * Handle in the earlier part of the loop * because we need to handle trailing :: too. */ db_unread_token(t); continue; } db_printf("Expected ':' (%d) or '::' (%d) between " "hextets but got (%d).\n", tCOLON, tCOLONCOLON, t); goto exit; } } /* Check for trailing garbage. */ if (i == tets) { t = db_read_token_flags(tokflags); if (t != tEOL) { db_printf("Got unexpected garbage after address " "(%d).\n", t); goto exit; } } /* * Need to expand compact INET6 addresses. * * Technically '::' for a single ':0:' is MUST NOT but just in case, * don't bother expanding that form (exp >= 0 && i == tets case). */ if (af == AF_INET6 && exp >= 0 && i < tets) { if (exp + 1 < i) { memmove(&hextets[exp + 1 + (nitems(hextets) - i)], &hextets[exp + 1], (i - (exp + 1)) * sizeof(hextets[0])); } memset(&hextets[exp + 1], 0, (nitems(hextets) - i) * sizeof(hextets[0])); } memset(&u, 0, sizeof(u)); if (af == AF_INET) { u.dest_sin.sin_family = AF_INET; u.dest_sin.sin_len = sizeof(u.dest_sin); u.dest_sin.sin_addr.s_addr = htonl( ((uint32_t)hextets[0] << 24) | ((uint32_t)hextets[1] << 16) | ((uint32_t)hextets[2] << 8) | (uint32_t)hextets[3]); dstp = (void *)&u.dest_sin; dst_addrp = &u.dest_sin.sin_addr; } else if (af == AF_INET6) { u.dest_sin6.sin6_family = AF_INET6; u.dest_sin6.sin6_len = sizeof(u.dest_sin6); for (i = 0; i < nitems(hextets); i++) u.dest_sin6.sin6_addr.s6_addr16[i] = htons(hextets[i]); dstp = (void *)&u.dest_sin6; dst_addrp = &u.dest_sin6.sin6_addr; } else { MPASS(false); /* UNREACHABLE */ /* Appease Clang false positive: */ dstp = NULL; } bp = inet_ntop(af, dst_addrp, buf, sizeof(buf)); if (bp != NULL) db_printf("Looking up route to destination '%s'\n", bp); CURVNET_SET(vnet0); rt = rtalloc1(dstp, 0, RTF_RNH_LOCKED); CURVNET_RESTORE(); if (rt == NULL) { db_printf("Could not get route for that server.\n"); return; } rt_dumpentry_ddb((void *)rt, NULL); RTFREE_LOCKED(rt); return; usage: db_printf("Usage: 'show route
'\n" " Currently accepts only dotted-decimal INET or colon-separated\n" " hextet INET6 addresses.\n"); exit: db_skip_to_eol(); } #endif diff --git a/sys/netgraph/ng_ether.c b/sys/netgraph/ng_ether.c index 575f3740a6f1..3b1a9efc849e 100644 --- a/sys/netgraph/ng_ether.c +++ b/sys/netgraph/ng_ether.c @@ -1,879 +1,882 @@ /* * ng_ether.c */ /*- * Copyright (c) 1996-2000 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Authors: Archie Cobbs * Julian Elischer * * $FreeBSD$ */ /* * ng_ether(4) netgraph node type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(ng_ether, 1); #define IFP2NG(ifp) ((ifp)->if_l2com) /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ hook_p upper; /* upper hook connection */ hook_p lower; /* lower hook connection */ hook_p orphan; /* orphan hook connection */ u_char autoSrcAddr; /* always overwrite source address */ u_char promisc; /* promiscuous mode enabled */ u_long hwassist; /* hardware checksum capabilities */ u_int flags; /* flags e.g. really die */ }; typedef struct private *priv_p; /* Hook pointers used by if_ethersubr.c to callback to netgraph */ extern void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); extern void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); extern int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); extern void (*ng_ether_attach_p)(struct ifnet *ifp); extern void (*ng_ether_detach_p)(struct ifnet *ifp); extern void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); /* Functional hooks called from if_ethersubr.c */ static void ng_ether_input(struct ifnet *ifp, struct mbuf **mp); static void ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m); static int ng_ether_output(struct ifnet *ifp, struct mbuf **mp); static void ng_ether_attach(struct ifnet *ifp); static void ng_ether_detach(struct ifnet *ifp); static void ng_ether_link_state(struct ifnet *ifp, int state); /* Other functions */ static int ng_ether_rcv_lower(hook_p node, item_p item); static int ng_ether_rcv_upper(hook_p node, item_p item); /* Netgraph node methods */ static ng_constructor_t ng_ether_constructor; static ng_rcvmsg_t ng_ether_rcvmsg; static ng_shutdown_t ng_ether_shutdown; static ng_newhook_t ng_ether_newhook; static ng_rcvdata_t ng_ether_rcvdata; static ng_disconnect_t ng_ether_disconnect; static int ng_ether_mod_event(module_t mod, int event, void *data); static eventhandler_tag ng_ether_ifnet_arrival_cookie; /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_ether_cmdlist[] = { { NGM_ETHER_COOKIE, NGM_ETHER_GET_IFNAME, "getifname", NULL, &ng_parse_string_type }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_IFINDEX, "getifindex", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_ENADDR, "getenaddr", NULL, &ng_parse_enaddr_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_ENADDR, "setenaddr", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_PROMISC, "getpromisc", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_PROMISC, "setpromisc", &ng_parse_int32_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_AUTOSRC, "getautosrc", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_AUTOSRC, "setautosrc", &ng_parse_int32_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_ADD_MULTI, "addmulti", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_DEL_MULTI, "delmulti", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_DETACH, "detach", NULL, NULL }, { 0 } }; static struct ng_type ng_ether_typestruct = { .version = NG_ABI_VERSION, .name = NG_ETHER_NODE_TYPE, .mod_event = ng_ether_mod_event, .constructor = ng_ether_constructor, .rcvmsg = ng_ether_rcvmsg, .shutdown = ng_ether_shutdown, .newhook = ng_ether_newhook, .rcvdata = ng_ether_rcvdata, .disconnect = ng_ether_disconnect, .cmdlist = ng_ether_cmdlist, }; NETGRAPH_INIT(ether, &ng_ether_typestruct); /****************************************************************** UTILITY FUNCTIONS ******************************************************************/ static void ng_ether_sanitize_ifname(const char *ifname, char *name) { int i; for (i = 0; i < IFNAMSIZ; i++) { if (ifname[i] == '.' || ifname[i] == ':') name[i] = '_'; else name[i] = ifname[i]; if (name[i] == '\0') break; } } /****************************************************************** ETHERNET FUNCTION HOOKS ******************************************************************/ /* * Handle a packet that has come in on an interface. We get to * look at it here before any upper layer protocols do. */ static void ng_ether_input(struct ifnet *ifp, struct mbuf **mp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error; /* If "lower" hook not connected, let packet continue */ if (priv->lower == NULL) return; NG_SEND_DATA_ONLY(error, priv->lower, *mp); /* sets *mp = NULL */ } /* * Handle a packet that has come in on an interface, and which * does not match any of our known protocols (an ``orphan''). */ static void ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error; /* If "orphan" hook not connected, discard packet */ if (priv->orphan == NULL) { m_freem(m); return; } NG_SEND_DATA_ONLY(error, priv->orphan, m); } /* * Handle a packet that is going out on an interface. * The Ethernet header is already attached to the mbuf. */ static int ng_ether_output(struct ifnet *ifp, struct mbuf **mp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error = 0; /* If "upper" hook not connected, let packet continue */ if (priv->upper == NULL) return (0); /* Send it out "upper" hook */ NG_OUTBOUND_THREAD_REF(); NG_SEND_DATA_ONLY(error, priv->upper, *mp); NG_OUTBOUND_THREAD_UNREF(); return (error); } /* * A new Ethernet interface has been attached. * Create a new node for it, etc. */ static void ng_ether_attach(struct ifnet *ifp) { char name[IFNAMSIZ]; priv_p priv; node_p node; /* * Do not create / attach an ether node to this ifnet if * a netgraph node with the same name already exists. * This should prevent ether nodes to become attached to * eiface nodes, which may be problematic due to naming * clashes. */ ng_ether_sanitize_ifname(ifp->if_xname, name); if ((node = ng_name2noderef(NULL, name)) != NULL) { NG_NODE_UNREF(node); return; } /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { log(LOG_ERR, "%s: can't %s for %s\n", __func__, "create node", ifp->if_xname); return; } /* Allocate private data */ priv = malloc(sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO); if (priv == NULL) { log(LOG_ERR, "%s: can't %s for %s\n", __func__, "allocate memory", ifp->if_xname); NG_NODE_UNREF(node); return; } NG_NODE_SET_PRIVATE(node, priv); priv->ifp = ifp; IFP2NG(ifp) = node; priv->hwassist = ifp->if_hwassist; /* Try to give the node the same name as the interface */ if (ng_name_node(node, name) != 0) log(LOG_WARNING, "%s: can't name node %s\n", __func__, name); } /* * An Ethernet interface is being detached. * REALLY Destroy its node. */ static void ng_ether_detach(struct ifnet *ifp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); NG_NODE_REALLY_DIE(node); /* Force real removal of node */ /* * We can't assume the ifnet is still around when we run shutdown * So zap it now. XXX We HOPE that anything running at this time * handles it (as it should in the non netgraph case). */ IFP2NG(ifp) = NULL; priv->ifp = NULL; /* XXX race if interrupted an output packet */ ng_rmnode_self(node); /* remove all netgraph parts */ } /* * Notify graph about link event. * if_link_state_change() has already checked that the state has changed. */ static void ng_ether_link_state(struct ifnet *ifp, int state) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *msg; int cmd, dummy_error = 0; if (state == LINK_STATE_UP) cmd = NGM_LINK_IS_UP; else if (state == LINK_STATE_DOWN) cmd = NGM_LINK_IS_DOWN; else return; if (priv->lower != NULL) { NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT); if (msg != NULL) NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->lower, 0); } if (priv->orphan != NULL) { NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT); if (msg != NULL) NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->orphan, 0); } } /* * Interface arrival notification handler. * The notification is produced in two cases: * o a new interface arrives * o an existing interface got renamed * Currently the first case is handled by ng_ether_attach via special * hook ng_ether_attach_p. */ static void ng_ether_ifnet_arrival_event(void *arg __unused, struct ifnet *ifp) { char name[IFNAMSIZ]; node_p node; /* Only ethernet interfaces are of interest. */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return; /* * Just return if it's a new interface without an ng_ether companion. */ node = IFP2NG(ifp); if (node == NULL) return; /* Try to give the node the same name as the new interface name */ ng_ether_sanitize_ifname(ifp->if_xname, name); if (ng_name_node(node, name) != 0) log(LOG_WARNING, "%s: can't re-name node %s\n", __func__, name); } /****************************************************************** NETGRAPH NODE METHODS ******************************************************************/ /* * It is not possible or allowable to create a node of this type. * Nodes get created when the interface is attached (or, when * this node type's KLD is loaded). */ static int ng_ether_constructor(node_p node) { return (EINVAL); } /* * Check for attaching a new hook. */ static int ng_ether_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); hook_p *hookptr; /* Divert hook is an alias for lower */ if (strcmp(name, NG_ETHER_HOOK_DIVERT) == 0) name = NG_ETHER_HOOK_LOWER; /* Which hook? */ if (strcmp(name, NG_ETHER_HOOK_UPPER) == 0) { hookptr = &priv->upper; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_upper); NG_HOOK_SET_TO_INBOUND(hook); } else if (strcmp(name, NG_ETHER_HOOK_LOWER) == 0) { hookptr = &priv->lower; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower); } else if (strcmp(name, NG_ETHER_HOOK_ORPHAN) == 0) { hookptr = &priv->orphan; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower); } else return (EINVAL); /* Check if already connected (shouldn't be, but doesn't hurt) */ if (*hookptr != NULL) return (EISCONN); /* Disable hardware checksums while 'upper' hook is connected */ if (hookptr == &priv->upper) priv->ifp->if_hwassist = 0; NG_HOOK_HI_STACK(hook); /* OK */ *hookptr = hook; return (0); } /* * Receive an incoming control message. */ static int ng_ether_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_ETHER_COOKIE: switch (msg->header.cmd) { case NGM_ETHER_GET_IFNAME: NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ); break; case NGM_ETHER_GET_IFINDEX: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->ifp->if_index; break; case NGM_ETHER_GET_ENADDR: NG_MKRESPONSE(resp, msg, ETHER_ADDR_LEN, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } bcopy(IF_LLADDR(priv->ifp), resp->data, ETHER_ADDR_LEN); break; case NGM_ETHER_SET_ENADDR: { if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } error = if_setlladdr(priv->ifp, (u_char *)msg->data, ETHER_ADDR_LEN); break; } case NGM_ETHER_GET_PROMISC: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->promisc; break; case NGM_ETHER_SET_PROMISC: { u_char want; if (msg->header.arglen != sizeof(u_int32_t)) { error = EINVAL; break; } want = !!*((u_int32_t *)msg->data); if (want ^ priv->promisc) { if ((error = ifpromisc(priv->ifp, want)) != 0) break; priv->promisc = want; } break; } case NGM_ETHER_GET_AUTOSRC: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->autoSrcAddr; break; case NGM_ETHER_SET_AUTOSRC: if (msg->header.arglen != sizeof(u_int32_t)) { error = EINVAL; break; } priv->autoSrcAddr = !!*((u_int32_t *)msg->data); break; case NGM_ETHER_ADD_MULTI: { struct sockaddr_dl sa_dl; struct ifmultiaddr *ifma; if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } bzero(&sa_dl, sizeof(struct sockaddr_dl)); sa_dl.sdl_len = sizeof(struct sockaddr_dl); sa_dl.sdl_family = AF_LINK; sa_dl.sdl_alen = ETHER_ADDR_LEN; bcopy((void *)msg->data, LLADDR(&sa_dl), ETHER_ADDR_LEN); /* * Netgraph is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ if_maddr_rlock(priv->ifp); ifma = if_findmulti(priv->ifp, (struct sockaddr *)&sa_dl); if_maddr_runlock(priv->ifp); if (ifma != NULL) { error = EADDRINUSE; } else { error = if_addmulti(priv->ifp, (struct sockaddr *)&sa_dl, &ifma); } break; } case NGM_ETHER_DEL_MULTI: { struct sockaddr_dl sa_dl; if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } bzero(&sa_dl, sizeof(struct sockaddr_dl)); sa_dl.sdl_len = sizeof(struct sockaddr_dl); sa_dl.sdl_family = AF_LINK; sa_dl.sdl_alen = ETHER_ADDR_LEN; bcopy((void *)msg->data, LLADDR(&sa_dl), ETHER_ADDR_LEN); error = if_delmulti(priv->ifp, (struct sockaddr *)&sa_dl); break; } case NGM_ETHER_DETACH: ng_ether_detach(priv->ifp); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Receive data on a hook. * Since we use per-hook recveive methods this should never be called. */ static int ng_ether_rcvdata(hook_p hook, item_p item) { NG_FREE_ITEM(item); panic("%s: weird hook", __func__); } /* * Handle an mbuf received on the "lower" or "orphan" hook. */ static int ng_ether_rcv_lower(hook_p hook, item_p item) { struct mbuf *m; const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Check whether interface is ready for packets */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { NG_FREE_M(m); return (ENETDOWN); } /* Make sure header is fully pulled up */ if (m->m_pkthdr.len < sizeof(struct ether_header)) { NG_FREE_M(m); return (EINVAL); } if (m->m_len < sizeof(struct ether_header) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); /* Drop in the MAC address if desired */ if (priv->autoSrcAddr) { /* Make the mbuf writable if it's not already */ if (!M_WRITABLE(m) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); /* Overwrite source MAC address */ bcopy(IF_LLADDR(ifp), mtod(m, struct ether_header *)->ether_shost, ETHER_ADDR_LEN); } /* Send it on its way */ return ether_output_frame(ifp, m); } /* * Handle an mbuf received on the "upper" hook. */ static int ng_ether_rcv_upper(hook_p hook, item_p item) { + struct epoch_tracker et; struct mbuf *m; const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *ifp = priv->ifp; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Check length and pull off header */ if (m->m_pkthdr.len < sizeof(struct ether_header)) { NG_FREE_M(m); return (EINVAL); } if (m->m_len < sizeof(struct ether_header) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); m->m_pkthdr.rcvif = ifp; /* Pass the packet to the bridge, it may come back to us */ if (ifp->if_bridge) { BRIDGE_INPUT(ifp, m); if (m == NULL) return (0); } /* Route packet back in */ + NET_EPOCH_ENTER(et); ether_demux(ifp, m); + NET_EPOCH_EXIT(et); return (0); } /* * Shutdown node. This resets the node but does not remove it * unless the REALLY_DIE flag is set. */ static int ng_ether_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); if (node->nd_flags & NGF_REALLY_DIE) { /* * WE came here because the ethernet card is being unloaded, * so stop being persistent. * Actually undo all the things we did on creation. * Assume the ifp has already been freed. */ NG_NODE_SET_PRIVATE(node, NULL); free(priv, M_NETGRAPH); NG_NODE_UNREF(node); /* free node itself */ return (0); } if (priv->promisc) { /* disable promiscuous mode */ (void)ifpromisc(priv->ifp, 0); priv->promisc = 0; } NG_NODE_REVIVE(node); /* Signal ng_rmnode we are persisant */ return (0); } /* * Hook disconnection. */ static int ng_ether_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); if (hook == priv->upper) { priv->upper = NULL; if (priv->ifp != NULL) /* restore h/w csum */ priv->ifp->if_hwassist = priv->hwassist; } else if (hook == priv->lower) priv->lower = NULL; else if (hook == priv->orphan) priv->orphan = NULL; else panic("%s: weird hook", __func__); if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) ng_rmnode_self(NG_HOOK_NODE(hook)); /* reset node */ return (0); } /****************************************************************** INITIALIZATION ******************************************************************/ /* * Handle loading and unloading for this node type. */ static int ng_ether_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: /* Register function hooks */ if (ng_ether_attach_p != NULL) { error = EEXIST; break; } ng_ether_attach_p = ng_ether_attach; ng_ether_detach_p = ng_ether_detach; ng_ether_output_p = ng_ether_output; ng_ether_input_p = ng_ether_input; ng_ether_input_orphan_p = ng_ether_input_orphan; ng_ether_link_state_p = ng_ether_link_state; ng_ether_ifnet_arrival_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, ng_ether_ifnet_arrival_event, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: /* * Note that the base code won't try to unload us until * all nodes have been removed, and that can't happen * until all Ethernet interfaces are removed. In any * case, we know there are no nodes left if the action * is MOD_UNLOAD, so there's no need to detach any nodes. */ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ng_ether_ifnet_arrival_cookie); /* Unregister function hooks */ ng_ether_attach_p = NULL; ng_ether_detach_p = NULL; ng_ether_output_p = NULL; ng_ether_input_p = NULL; ng_ether_input_orphan_p = NULL; ng_ether_link_state_p = NULL; break; default: error = EOPNOTSUPP; break; } return (error); } static void vnet_ng_ether_init(const void *unused) { struct ifnet *ifp; /* If module load was rejected, don't attach to vnets. */ if (ng_ether_attach_p != ng_ether_attach) return; /* Create nodes for any already-existing Ethernet interfaces. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) ng_ether_attach(ifp); } IFNET_RUNLOCK(); } VNET_SYSINIT(vnet_ng_ether_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ng_ether_init, NULL); diff --git a/sys/netgraph/ng_iface.c b/sys/netgraph/ng_iface.c index f1ddce70c86d..605a1f7d9e24 100644 --- a/sys/netgraph/ng_iface.c +++ b/sys/netgraph/ng_iface.c @@ -1,817 +1,820 @@ /* * ng_iface.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * * $FreeBSD$ * $Whistle: ng_iface.c,v 1.33 1999/11/01 09:24:51 julian Exp $ */ /* * This node is also a system networking interface. It has * a hook for each protocol (IP, AppleTalk, etc). Packets * are simply relayed between the interface and the hooks. * * Interfaces are named ng0, ng1, etc. New nodes take the * first available interface name. * * This node also includes Berkeley packet filter support. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_IFACE, "netgraph_iface", "netgraph iface node"); #else #define M_NETGRAPH_IFACE M_NETGRAPH #endif static SYSCTL_NODE(_net_graph, OID_AUTO, iface, CTLFLAG_RW, 0, "Point to point netgraph interface"); VNET_DEFINE_STATIC(int, ng_iface_max_nest) = 2; #define V_ng_iface_max_nest VNET(ng_iface_max_nest) SYSCTL_INT(_net_graph_iface, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ng_iface_max_nest), 0, "Max nested tunnels"); /* This struct describes one address family */ struct iffam { sa_family_t family; /* Address family */ const char *hookname; /* Name for hook */ }; typedef const struct iffam *iffam_p; /* List of address families supported by our interface */ const static struct iffam gFamilies[] = { { AF_INET, NG_IFACE_HOOK_INET }, { AF_INET6, NG_IFACE_HOOK_INET6 }, { AF_ATM, NG_IFACE_HOOK_ATM }, { AF_NATM, NG_IFACE_HOOK_NATM }, }; #define NUM_FAMILIES nitems(gFamilies) /* Node private data */ struct ng_iface_private { struct ifnet *ifp; /* Our interface */ int unit; /* Interface unit number */ node_p node; /* Our netgraph node */ hook_p hooks[NUM_FAMILIES]; /* Hook for each address family */ struct rmlock lock; /* Protect private data changes */ }; typedef struct ng_iface_private *priv_p; #define PRIV_RLOCK(priv, t) rm_rlock(&priv->lock, t) #define PRIV_RUNLOCK(priv, t) rm_runlock(&priv->lock, t) #define PRIV_WLOCK(priv) rm_wlock(&priv->lock) #define PRIV_WUNLOCK(priv) rm_wunlock(&priv->lock) /* Interface methods */ static void ng_iface_start(struct ifnet *ifp); static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro); static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa); #ifdef DEBUG static void ng_iface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data); #endif /* Netgraph methods */ static int ng_iface_mod_event(module_t, int, void *); static ng_constructor_t ng_iface_constructor; static ng_rcvmsg_t ng_iface_rcvmsg; static ng_shutdown_t ng_iface_shutdown; static ng_newhook_t ng_iface_newhook; static ng_rcvdata_t ng_iface_rcvdata; static ng_disconnect_t ng_iface_disconnect; /* Helper stuff */ static iffam_p get_iffam_from_af(sa_family_t family); static iffam_p get_iffam_from_hook(priv_p priv, hook_p hook); static iffam_p get_iffam_from_name(const char *name); static hook_p *get_hook_from_iffam(priv_p priv, iffam_p iffam); /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_iface_cmds[] = { { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFNAME, "getifname", NULL, &ng_parse_string_type }, { NGM_IFACE_COOKIE, NGM_IFACE_POINT2POINT, "point2point", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_BROADCAST, "broadcast", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFINDEX, "getifindex", NULL, &ng_parse_uint32_type }, { 0 } }; /* Node type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_IFACE_NODE_TYPE, .mod_event = ng_iface_mod_event, .constructor = ng_iface_constructor, .rcvmsg = ng_iface_rcvmsg, .shutdown = ng_iface_shutdown, .newhook = ng_iface_newhook, .rcvdata = ng_iface_rcvdata, .disconnect = ng_iface_disconnect, .cmdlist = ng_iface_cmds, }; NETGRAPH_INIT(iface, &typestruct); VNET_DEFINE_STATIC(struct unrhdr *, ng_iface_unit); #define V_ng_iface_unit VNET(ng_iface_unit) /************************************************************************ HELPER STUFF ************************************************************************/ /* * Get the family descriptor from the family ID */ static __inline iffam_p get_iffam_from_af(sa_family_t family) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (iffam->family == family) return (iffam); } return (NULL); } /* * Get the family descriptor from the hook */ static __inline iffam_p get_iffam_from_hook(priv_p priv, hook_p hook) { int k; for (k = 0; k < NUM_FAMILIES; k++) if (priv->hooks[k] == hook) return (&gFamilies[k]); return (NULL); } /* * Get the hook from the iffam descriptor */ static __inline hook_p * get_hook_from_iffam(priv_p priv, iffam_p iffam) { return (&priv->hooks[iffam - gFamilies]); } /* * Get the iffam descriptor from the name */ static __inline iffam_p get_iffam_from_name(const char *name) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (!strcmp(iffam->hookname, name)) return (iffam); } return (NULL); } /************************************************************************ INTERFACE STUFF ************************************************************************/ /* * Process an ioctl for the virtual interface */ static int ng_iface_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifreq *const ifr = (struct ifreq *) data; int error = 0; #ifdef DEBUG ng_iface_print_ioctl(ifp, command, data); #endif switch (command) { /* These two are mostly handled at a higher layer */ case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); break; case SIOCGIFADDR: break; /* Set flags */ case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. * If it is marked down and running, then stop it. */ if (ifr->ifr_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); ifp->if_drv_flags |= IFF_DRV_RUNNING; } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } break; /* Set the interface MTU */ case SIOCSIFMTU: if (ifr->ifr_mtu > NG_IFACE_MTU_MAX || ifr->ifr_mtu < NG_IFACE_MTU_MIN) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; /* Stuff that's not supported */ case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFPHYS: error = EOPNOTSUPP; break; default: error = EINVAL; break; } return (error); } /* * This routine is called to deliver a packet out the interface. * We simply look at the address family and relay the packet to * the corresponding hook, if it exists and is connected. */ static int ng_iface_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; int error; /* Check interface flags */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { m_freem(m); return (ENETDOWN); } /* Protect from deadly infinite recursion. */ error = if_tunnel_check_nesting(ifp, m, NGM_IFACE_COOKIE, V_ng_iface_max_nest); if (error) { m_freem(m); return (error); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, af); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { M_PREPEND(m, sizeof(sa_family_t), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return (ENOBUFS); } *(sa_family_t *)m->m_data = af; error = (ifp->if_transmit)(ifp, m); } else error = ng_iface_send(ifp, m, af); return (error); } /* * Start method is used only when ALTQ is enabled. */ static void ng_iface_start(struct ifnet *ifp) { struct mbuf *m; sa_family_t sa; KASSERT(ALTQ_IS_ENABLED(&ifp->if_snd), ("%s without ALTQ", __func__)); for(;;) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; sa = *mtod(m, sa_family_t *); m_adj(m, sizeof(sa_family_t)); ng_iface_send(ifp, m, sa); } } /* * Flash a packet by the BPF (requires prepending 4 byte AF header) * Note the phoney mbuf; this is OK because BPF treats it read-only. */ static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family) { KASSERT(family != AF_UNSPEC, ("%s: family=AF_UNSPEC", __func__)); if (bpf_peers_present(ifp->if_bpf)) { int32_t family4 = (int32_t)family; bpf_mtap2(ifp->if_bpf, &family4, sizeof(family4), m); } } /* * This routine does actual delivery of the packet into the * netgraph(4). It is called from ng_iface_start() and * ng_iface_output(). */ static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa) { struct rm_priotracker priv_tracker; const priv_p priv = (priv_p) ifp->if_softc; const iffam_p iffam = get_iffam_from_af(sa); hook_p hook; int error; int len; /* Check address family to determine hook (if known) */ if (iffam == NULL) { m_freem(m); log(LOG_WARNING, "%s: can't handle af%d\n", ifp->if_xname, sa); return (EAFNOSUPPORT); } /* Copy length before the mbuf gets invalidated. */ len = m->m_pkthdr.len; PRIV_RLOCK(priv, &priv_tracker); hook = *get_hook_from_iffam(priv, iffam); if (hook == NULL) { NG_FREE_M(m); PRIV_RUNLOCK(priv, &priv_tracker); return ENETDOWN; } NG_HOOK_REF(hook); PRIV_RUNLOCK(priv, &priv_tracker); NG_OUTBOUND_THREAD_REF(); NG_SEND_DATA_ONLY(error, hook, m); NG_OUTBOUND_THREAD_UNREF(); NG_HOOK_UNREF(hook); /* Update stats. */ if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } return (error); } #ifdef DEBUG /* * Display an ioctl to the virtual interface */ static void ng_iface_print_ioctl(struct ifnet *ifp, int command, caddr_t data) { char *str; switch (command & IOC_DIRMASK) { case IOC_VOID: str = "IO"; break; case IOC_OUT: str = "IOR"; break; case IOC_IN: str = "IOW"; break; case IOC_INOUT: str = "IORW"; break; default: str = "IO??"; } log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n", ifp->if_xname, str, IOCGROUP(command), command & 0xff, IOCPARM_LEN(command)); } #endif /* DEBUG */ /************************************************************************ NETGRAPH NODE STUFF ************************************************************************/ /* * Constructor for a node */ static int ng_iface_constructor(node_p node) { struct ifnet *ifp; priv_p priv; /* Allocate node and interface private structures */ priv = malloc(sizeof(*priv), M_NETGRAPH_IFACE, M_WAITOK | M_ZERO); ifp = if_alloc(IFT_PROPVIRTUAL); if (ifp == NULL) { free(priv, M_NETGRAPH_IFACE); return (ENOMEM); } rm_init(&priv->lock, "ng_iface private rmlock"); /* Link them together */ ifp->if_softc = priv; priv->ifp = ifp; /* Get an interface unit number */ priv->unit = alloc_unr(V_ng_iface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); priv->node = node; /* Initialize interface structure */ if_initname(ifp, NG_IFACE_IFACE_NAME, priv->unit); ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); ifp->if_type = IFT_PROPVIRTUAL; /* XXX */ ifp->if_addrlen = 0; /* XXX */ ifp->if_hdrlen = 0; /* XXX */ ifp->if_baudrate = 64000; /* XXX */ IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); /* Give this node the same name as the interface (if possible) */ if (ng_name_node(node, ifp->if_xname) != 0) log(LOG_WARNING, "%s: can't acquire netgraph name\n", ifp->if_xname); /* Attach the interface */ if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); /* Done */ return (0); } /* * Give our ok for a hook to be added */ static int ng_iface_newhook(node_p node, hook_p hook, const char *name) { const iffam_p iffam = get_iffam_from_name(name); const priv_p priv = NG_NODE_PRIVATE(node); hook_p *hookptr; if (iffam == NULL) return (EPFNOSUPPORT); PRIV_WLOCK(priv); hookptr = get_hook_from_iffam(priv, iffam); if (*hookptr != NULL) { PRIV_WUNLOCK(priv); return (EISCONN); } *hookptr = hook; NG_HOOK_HI_STACK(hook); NG_HOOK_SET_TO_INBOUND(hook); PRIV_WUNLOCK(priv); return (0); } /* * Receive a control message */ static int ng_iface_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_IFACE_COOKIE: switch (msg->header.cmd) { case NGM_IFACE_GET_IFNAME: NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } strlcpy(resp->data, ifp->if_xname, IFNAMSIZ); break; case NGM_IFACE_POINT2POINT: case NGM_IFACE_BROADCAST: { /* Deny request if interface is UP */ if ((ifp->if_flags & IFF_UP) != 0) return (EBUSY); /* Change flags */ switch (msg->header.cmd) { case NGM_IFACE_POINT2POINT: ifp->if_flags |= IFF_POINTOPOINT; ifp->if_flags &= ~IFF_BROADCAST; break; case NGM_IFACE_BROADCAST: ifp->if_flags &= ~IFF_POINTOPOINT; ifp->if_flags |= IFF_BROADCAST; break; } break; } case NGM_IFACE_GET_IFINDEX: NG_MKRESPONSE(resp, msg, sizeof(uint32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((uint32_t *)resp->data) = priv->ifp->if_index; break; default: error = EINVAL; break; } break; case NGM_FLOW_COOKIE: switch (msg->header.cmd) { case NGM_LINK_IS_UP: if_link_state_change(ifp, LINK_STATE_UP); break; case NGM_LINK_IS_DOWN: if_link_state_change(ifp, LINK_STATE_DOWN); break; default: break; } break; default: error = EINVAL; break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Recive data from a hook. Pass the packet to the correct input routine. */ static int ng_iface_rcvdata(hook_p hook, item_p item) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); struct ifnet *const ifp = priv->ifp; + struct epoch_tracker et; struct mbuf *m; int isr; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Sanity checks */ KASSERT(iffam != NULL, ("%s: iffam", __func__)); M_ASSERTPKTHDR(m); if ((ifp->if_flags & IFF_UP) == 0) { NG_FREE_M(m); return (ENETDOWN); } /* Update interface stats */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, iffam->family); /* Send packet */ switch (iffam->family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG); M_SETFIB(m, ifp->if_fib); + NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); + NET_EPOCH_EXIT(et); return (0); } /* * Shutdown and remove the node and its associated interface. */ static int ng_iface_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); /* * The ifnet may be in a different vnet than the netgraph node, * hence we have to change the current vnet context here. */ CURVNET_SET_QUIET(priv->ifp->if_vnet); bpfdetach(priv->ifp); if_detach(priv->ifp); if_free(priv->ifp); CURVNET_RESTORE(); priv->ifp = NULL; free_unr(V_ng_iface_unit, priv->unit); rm_destroy(&priv->lock); free(priv, M_NETGRAPH_IFACE); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); return (0); } /* * Hook disconnection. Note that we do *not* shutdown when all * hooks have been disconnected. */ static int ng_iface_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); if (iffam == NULL) panic("%s", __func__); PRIV_WLOCK(priv); *get_hook_from_iffam(priv, iffam) = NULL; PRIV_WUNLOCK(priv); return (0); } /* * Handle loading and unloading for this node type. */ static int ng_iface_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: case MOD_UNLOAD: break; default: error = EOPNOTSUPP; break; } return (error); } static void vnet_ng_iface_init(const void *unused) { V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); } VNET_SYSINIT(vnet_ng_iface_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ng_iface_init, NULL); static void vnet_ng_iface_uninit(const void *unused) { delete_unrhdr(V_ng_iface_unit); } VNET_SYSUNINIT(vnet_ng_iface_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_ng_iface_uninit, NULL); diff --git a/sys/netgraph/ng_ip_input.c b/sys/netgraph/ng_ip_input.c index 5e6dce38f6d1..88e0e11c08d3 100644 --- a/sys/netgraph/ng_ip_input.c +++ b/sys/netgraph/ng_ip_input.c @@ -1,142 +1,147 @@ /* * ng_ip_input.c */ /*- * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause * * Copyright 2001 The Aerospace Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of The Aerospace Corporation may not be used to endorse or * promote products derived from this software. * * THIS SOFTWARE IS PROVIDED BY THE AEROSPACE CORPORATION "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AEROSPACE CORPORATION BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Brooks Davis * Derived from: ng_hole.c * * $FreeBSD$ */ /* * This node simply takes IP packets and calls ip_input on them */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Netgraph methods */ static ng_constructor_t ngipi_cons; static ng_rcvdata_t ngipi_rcvdata; static ng_disconnect_t ngipi_disconnect; static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_IP_INPUT_NODE_TYPE, .constructor = ngipi_cons, .rcvdata = ngipi_rcvdata, .disconnect = ngipi_disconnect, }; NETGRAPH_INIT(ip_input, &typestruct); /* * Be obliging. but no work to do. */ static int ngipi_cons(node_p node) { return(0); } /* * Receive data */ static int ngipi_rcvdata(hook_p hook, item_p item) { struct mbuf *m; NGI_GET_M(item, m); NG_FREE_ITEM(item); if (curthread->td_ng_outbound) netisr_queue(NETISR_IP, m); - else + else { + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); netisr_dispatch(NETISR_IP, m); + NET_EPOCH_EXIT(et); + } return 0; } /* * Hook disconnection */ static int ngipi_disconnect(hook_p hook) { if (NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 0c85349f88a8..cb3c335d0897 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -1,1518 +1,1513 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #define SIN(s) ((const struct sockaddr_in *)(s)) static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; /* Simple ARP state machine */ enum arp_llinfo_state { ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */ ARP_LLINFO_REACHABLE, /* LLE is valid */ ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */ ARP_LLINFO_DELETED, /* LLE is deleted */ }; SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ VNET_DEFINE_STATIC(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ VNET_DEFINE_STATIC(int, arp_maxtries) = 5; VNET_DEFINE_STATIC(int, arp_proxyall) = 0; VNET_DEFINE_STATIC(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ VNET_DEFINE_STATIC(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arpt_rexmit VNET(arpt_rexmit) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); /* * Due to the exponential backoff algorithm used for the interval between GARP * retransmissions, the maximum number of retransmissions is limited for * sanity. This limit corresponds to a maximum interval between retransmissions * of 2^16 seconds ~= 18 hours. * * Making this limit more dynamic is more complicated than worthwhile, * especially since sending out GARPs spaced days apart would be of little * use. A maximum dynamic limit would look something like: * * const int max = fls(INT_MAX / hz) - 1; */ #define MAX_GARP_RETRANSMITS 16 static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS); static int garp_rexmit_count = 0; /* GARP retransmission setting. */ SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &garp_rexmit_count, 0, sysctl_garp_rexmit, "I", "Number of times to retransmit GARP packets;" " 0 to disable, maximum of 16"); VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO; /* Min. log(9) level. */ #define V_arp_log_level VNET(arp_log_level) SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_log_level), 0, "Minimum log(9) level for recording rate limited arp log messages. " "The higher will be log more (emerg=0, info=6 (default), debug=7)."); #define ARP_LOG(pri, ...) do { \ if ((pri) <= V_arp_log_level && \ ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la); static void arp_mark_lle_reachable(struct llentry *la); static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); static eventhandler_tag iflladdr_tag; static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; int r_skip_req; if (lle->la_flags & LLE_STATIC) { return; } LLE_WLOCK(lle); if (callout_pending(&lle->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by arpresolve() below. */ LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); switch (lle->ln_state) { case ARP_LLINFO_REACHABLE: /* * Expiration time is approaching. * Let's try to refresh entry if it is still * in use. * * Set r_skip_req to get feedback from * fast path. Change state and re-schedule * ourselves. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); lle->ln_state = ARP_LLINFO_VERIFY; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; case ARP_LLINFO_VERIFY: LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; LLE_REQ_UNLOCK(lle); if (r_skip_req == 0 && lle->la_preempt > 0) { /* Entry was used, issue refresh request */ + struct epoch_tracker et; struct in_addr dst; + dst = lle->r_l3addr.addr4; lle->la_preempt--; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); + NET_EPOCH_ENTER(et); arprequest(ifp, NULL, &dst, NULL); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return; } /* Nothing happened. Reschedule if not too late */ if (lle->la_expire > time_uptime) { callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; } break; case ARP_LLINFO_INCOMPLETE: case ARP_LLINFO_DELETED: break; } if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->lle_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { LLE_REMREF(lle); lltable_unlink_entry(lle->lle_tbl, lle); } IF_AFDATA_UNLOCK(ifp); size_t pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Stores link-layer header for @ifp in format suitable for if_output() * into buffer @buf. Resulting header length is stored in @bufsize. * * Returns 0 on success. */ static int arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, size_t *bufsize) { struct if_encap_req ereq; int error; bzero(buf, *bufsize); bzero(&ereq, sizeof(ereq)); ereq.buf = buf; ereq.bufsize = *bufsize; ereq.rtype = IFENCAP_LL; ereq.family = AF_ARP; ereq.lladdr = ar_tha(ah); ereq.hdata = (u_char *)ah; if (bcast) ereq.flags = IFENCAP_FLAG_BROADCAST; error = ifp->if_requestencap(ifp, &ereq); if (error == 0) *bufsize = ereq.bufsize; return (error); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ static int arprequest_internal(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; struct route ro; int error; + NET_EPOCH_ASSERT(); + if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ - struct epoch_tracker et; struct ifaddr *ifa; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } - NET_EPOCH_EXIT(et); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return (EADDRNOTAVAIL); } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return (ENOMEM); m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return (error); } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); if (error) { ARPSTAT_INC(txerrors); ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n", if_name(ifp), error); } return (error); } void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { (void) arprequest_internal(ifp, sip, tip, enaddr); } /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). * We have already checked that we can't use an existing lle without * modification so we have to acquire an LLE_EXCLUSIVE lle lock. * * On success, desten and pflags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ static int arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; char *lladdr; int ll_len; + NET_EPOCH_ASSERT(); + if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; - if ((flags & LLE_CREATE) == 0) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); + if ((flags & LLE_CREATE) == 0) la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); - NET_EPOCH_EXIT(et); - } if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { char addrbuf[INET_ADDRSTRLEN]; log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", inet_ntoa_r(SIN(dst)->sin_addr, addrbuf), if_name(ifp)); m_freem(m); return (EINVAL); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* Prefer ANY existing lle over newly-created one */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; } } if (la == NULL) { m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { if (flags & LLE_ADDRONLY) { lladdr = la->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = la->r_linkdata; ll_len = la->r_hdrlen; } bcopy(lladdr, desten, ll_len); /* Notify LLE code that the entry was used by datapath */ llentry_mark_used(la); if (pflags != NULL) *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR); if (plle) { LLE_ADDREF(la); *plle = la; } LLE_WUNLOCK(la); return (0); } renew = (la->la_asked == 0 || la->la_expire != time_uptime); /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled, e; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->lle_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL); /* * Only overwrite 'error' in case of error; in case of success * the proper return value was already set above. */ if (e != 0) return (e); return (error); } LLE_WUNLOCK(la); return (error); } /* * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is the storage to put LL header. * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR * * On success, full/partial link header and flags are filled in and * the function returns 0. * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { - struct epoch_tracker et; struct llentry *la = NULL; + NET_EPOCH_ASSERT(); + if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } - NET_EPOCH_ENTER(et); la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(la->r_linkdata, desten, la->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); /* Notify the LLE handling code that the entry was used. */ llentry_mark_used(la); if (plle) { LLE_ADDREF(la); *plle = la; LLE_WUNLOCK(la); } - NET_EPOCH_EXIT(et); return (0); } if (plle && la) LLE_WUNLOCK(la); - NET_EPOCH_EXIT(et); return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, desten, pflags, plle)); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; struct ifnet *ifp; char *layer; int hlen; ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); /* Check if length is sufficient */ if (m->m_len < arphdr_len(ar)) { m = m_pullup(m, arphdr_len(ar)); if (m == NULL) { ARP_LOG(LOG_NOTICE, "short packet received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); } hlen = 0; layer = ""; switch (ntohs(ar->ar_hrd)) { case ARPHRD_ETHER: hlen = ETHER_ADDR_LEN; /* RFC 826 */ layer = "ethernet"; break; case ARPHRD_INFINIBAND: hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ layer = "infiniband"; break; case ARPHRD_IEEE1394: hlen = 0; /* SHALL be 16 */ /* RFC 2734 */ layer = "firewire"; /* * Restrict too long hardware addresses. * Currently we are capable of handling 20-byte * addresses ( sizeof(lle->ll_addr) ) */ if (ar->ar_hln >= 20) hlen = 16; break; default: ARP_LOG(LOG_NOTICE, "packet with unknown hardware format 0x%02d received on " "%s\n", ntohs(ar->ar_hrd), if_name(ifp)); m_freem(m); return; } if (hlen != 0 && hlen != ar->ar_hln) { ARP_LOG(LOG_NOTICE, "packet with invalid %s address length %d received on %s\n", layer, ar->ar_hln, if_name(ifp)); m_freem(m); return; } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); static void in_arpinput(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL, *la_tmp; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; struct sockaddr *dst; struct nhop4_basic nh4; uint8_t linkhdr[LLE_MAX_LINKHDR]; struct route ro; size_t linkhdrsize; int lladdr_off; int error; char addrbuf[INET_ADDRSTRLEN]; - struct epoch_tracker et; + + NET_EPOCH_ASSERT(); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * We already have checked that mbuf contains enough contiguous data * to hold entire arp message according to the arp header. */ ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); - NET_EPOCH_EXIT(et); goto match; } - NET_EPOCH_EXIT(et); /* * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " "%s!\n", inet_ntoa_r(isaddr, addrbuf)); goto drop; } if (ifp->if_addrlen != ah->ar_hln) { ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; dst = (struct sockaddr *)&sin; - NET_EPOCH_ENTER(et); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); - NET_EPOCH_EXIT(et); if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { /* * Request/reply to our address, but no lle exists yet. * Calculate full link prepend to use in lle. */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) goto reply; /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { /* * lle creation may fail if source address belongs * to non-directly connected subnet. However, we * will try to answer the request instead of dropping * frame. */ goto reply; } lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* * Check if lle still does not exists. * If it does, that means that we either * 1) have configured it explicitly, via * 1a) 'arp -s' static entry or * 1b) interface address static record * or * 2) it was the result of sending first packet to-host * or * 3) it was another arp reply packet we handled in * different thread. * * In all cases except 3) we definitely need to prefer * existing lle. For the sake of simplicity, prefer any * existing lle over newly-create one. */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp == NULL) { arp_mark_lle_reachable(la); LLE_WUNLOCK(la); } else { /* Free newly-create entry and handle packet */ lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; la_tmp = NULL; arp_check_update_lle(ah, isaddr, ifp, bridged, la); /* arp_check_update_lle() returns @la unlocked */ } la = NULL; } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { struct llentry *lle = NULL; sin.sin_addr = itaddr; - NET_EPOCH_ENTER(et); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); - NET_EPOCH_EXIT(et); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; /* XXX MRT use table 0 for arp reply */ if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (nh4.nh_ifp == ifp) goto drop; (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ /* XXX MRT use table 0 for arp checks */ if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0) goto drop; if (nh4.nh_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s\n", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); goto drop; } #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa_r(itaddr, addrbuf)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa_r(itaddr, addrbuf)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); /* * arp_fillheader() may fail due to lack of support inside encap request * routing. This is not necessary an error, AF_ARP can/should be handled * by if_output(). */ if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif /* * Checks received arp data against existing @la. * Updates lle state/performs notification if necessary. */ static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la) { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; char addrbuf[INET_ADDRSTRLEN]; LLE_WLOCK_ASSERT(la); /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", inet_ntoa_r(isaddr, addrbuf), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); return; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); return; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", inet_ntoa_r(isaddr, addrbuf), ifp->if_addrlen, (u_char *)la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } /* Calculate full link prepend to use in lle */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) return; /* Check if something has changed */ if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { /* Try to perform LLE update */ if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off) == 0) return; /* Clear fast path feedback request if set */ la->r_skip_req = 0; } arp_mark_lle_reachable(la); /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. */ if (la->la_hold != NULL) { m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; lltable_fill_sa_entry(la, &sa); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } static void arp_mark_lle_reachable(struct llentry *la) { int canceled, wtime; LLE_WLOCK_ASSERT(la); la->ln_state = ARP_LLINFO_REACHABLE; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit; if (wtime < 0) wtime = V_arpt_keep; canceled = callout_reset(&la->lle_timer, hz * wtime, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; } /* * Add permanent link-layer record for given interface address. */ static __noinline void arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst) { struct llentry *lle, *lle_tmp; /* * Interface address LLE record is considered static * because kernel code relies on LLE_STATIC flag to check * if these entries can be rewriten by arp updates. */ lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst); if (lle == NULL) { log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); return; } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Unlink any entry if exists */ lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) lltable_unlink_entry(LLTABLE(ifp), lle_tmp); lltable_link_entry(LLTABLE(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); if (lle_tmp != NULL) lltable_free_entry(LLTABLE(ifp), lle_tmp); } /* * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range * of valid values. */ static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS) { int error; int rexmit_count = *(int *)arg1; error = sysctl_handle_int(oidp, &rexmit_count, 0, req); /* Enforce limits on any new value that may have been set. */ if (!error && req->newptr) { /* A new value was set. */ if (rexmit_count < 0) { rexmit_count = 0; } else if (rexmit_count > MAX_GARP_RETRANSMITS) { rexmit_count = MAX_GARP_RETRANSMITS; } *(int *)arg1 = rexmit_count; } return (error); } /* * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to * retransmit it again. A pending callout owns a reference to the ifa. */ static void garp_rexmit(void *arg) { struct in_ifaddr *ia = arg; if (callout_pending(&ia->ia_garp_timer) || !callout_active(&ia->ia_garp_timer)) { IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); ifa_free(&ia->ia_ifa); return; } CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet); /* * Drop lock while the ARP request is generated. */ IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr, &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp)); /* * Increment the count of retransmissions. If the count has reached the * maximum value, stop sending the GARP packets. Otherwise, schedule * the callout to retransmit another GARP packet. */ ++ia->ia_garp_count; if (ia->ia_garp_count >= garp_rexmit_count) { ifa_free(&ia->ia_ifa); } else { int rescheduled; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); rescheduled = callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia); IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); if (rescheduled) { ifa_free(&ia->ia_ifa); } } CURVNET_RESTORE(); } /* * Start the GARP retransmit timer. * * A single GARP is always transmitted when an IPv4 address is added * to an interface and that is usually sufficient. However, in some * circumstances, such as when a shared address is passed between * cluster nodes, this single GARP may occasionally be dropped or * lost. This can lead to neighbors on the network link working with a * stale ARP cache and sending packets destined for that address to * the node that previously owned the address, which may not respond. * * To avoid this situation, GARP retransmits can be enabled by setting * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater * than zero. The setting represents the maximum number of * retransmissions. The interval between retransmissions is calculated * using an exponential backoff algorithm, doubling each time, so the * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds). */ static void garp_timer_start(struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *) ifa; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); ia->ia_garp_count = 0; if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia) == 0) { ifa_ref(ifa); } IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { + struct epoch_tracker et; const struct sockaddr_in *dst_in; const struct sockaddr *dst; if (ifa->ifa_carp != NULL) return; dst = ifa->ifa_addr; dst_in = (const struct sockaddr_in *)dst; if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY) return; + NET_EPOCH_ENTER(et); arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); + NET_EPOCH_EXIT(et); if (garp_rexmit_count > 0) { garp_timer_start(ifa); } arp_add_ifa_lle(ifp, dst); } void arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr) { if (ntohl(addr.s_addr) != INADDR_ANY) arprequest(ifp, &addr, &addr, enaddr); } /* * Sends gratuitous ARPs for each ifaddr to notify other * nodes about the address change. */ static __noinline void arp_handle_ifllchange(struct ifnet *ifp) { struct ifaddr *ifa; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } } /* * A handler for interface link layer address change event. */ static void arp_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE(ifp)); if ((ifp->if_flags & IFF_UP) != 0) arp_handle_ifllchange(ifp); } static void vnet_arp_init(void) { if (IS_DEFAULT_VNET(curvnet)) { netisr_register(&arp_nh); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE else netisr_register_vnet(&arp_nh); #endif } VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, vnet_arp_init, 0); #ifdef VIMAGE /* * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5. */ static void vnet_arp_destroy(__unused void *arg) { netisr_unregister_vnet(&arp_nh); } VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_arp_destroy, NULL); #endif diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index beb0f98209be..81d09c926998 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -1,3660 +1,3646 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); static void igmp_v3_process_group_timers(struct in_multi_head *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LIST_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ VNET_DEFINE_STATIC(int, interface_timers_running); /* IGMPv3 general * query response */ VNET_DEFINE_STATIC(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ VNET_DEFINE_STATIC(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) VNET_DEFINE_STATIC(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); VNET_DEFINE_STATIC(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; VNET_DEFINE_STATIC(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) VNET_DEFINE_STATIC(int, igmp_recvifkludge) = 1; VNET_DEFINE_STATIC(int, igmp_sendra) = 1; VNET_DEFINE_STATIC(int, igmp_sendlocal) = 1; VNET_DEFINE_STATIC(int, igmp_v1enable) = 1; VNET_DEFINE_STATIC(int, igmp_v2enable) = 1; VNET_DEFINE_STATIC(int, igmp_legacysupp); VNET_DEFINE_STATIC(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { + struct epoch_tracker et; struct mbuf *m; + NET_EPOCH_ENTER(et); while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } + NET_EPOCH_EXIT(et); } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = (char)IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. */ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); SLIST_INIT(&inm_free_tmp); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) inm_rele_locked(&inm_free_tmp, inm); inm_clear_recorded(inm); if (__predict_false(ifma_restart)) { ifma_restart = false; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&inm_free_tmp); } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { - struct epoch_tracker et; struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; + NET_EPOCH_ASSERT(); + /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } - NET_EPOCH_EXIT(et); out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { - struct epoch_tracker et; struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; + NET_EPOCH_ASSERT(); + is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } - NET_EPOCH_EXIT(et); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. */ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LIST_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 0x%08x query on ifp %p(%s)", ntohl(igmpv3->igmp_group.s_addr), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); - NET_EPOCH_EXIT(et); } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; - struct epoch_tracker et; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ - NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { - NET_EPOCH_EXIT(et); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { - NET_EPOCH_EXIT(et); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { - NET_EPOCH_EXIT(et); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } - NET_EPOCH_EXIT(et); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_LIST_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma, *next; struct in_multi *inm; struct in_multi_head inm_free_tmp; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; SLIST_INIT(&inm_free_tmp); IN_MULTI_LIST_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(&inm_free_tmp, &qrq, &scq, inm, uri_fasthz); break; } if (__predict_false(ifma_restart)) { ifma_restart = false; goto restart; } } IF_ADDR_WUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ inm_release_list_deferred(&inm_free_tmp); } } out_locked: IGMP_UNLOCK(); IN_MULTI_LIST_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct in_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval __unused; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm_rele_locked(inmh, inm); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; struct in_multi_head inm_free_tmp; - struct epoch_tracker et; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); + NET_EPOCH_ASSERT(); + SLIST_INIT(&inm_free_tmp); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ inm_rele_locked(&inm_free_tmp, inm); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } - NET_EPOCH_EXIT(et); inm_release_list_deferred(&inm_free_tmp); } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. */ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; + NET_EPOCH_ASSERT(); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; error = 0; IN_MULTI_LOCK_ASSERT(); /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. */ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) { MPASS(inm->inm_refcount > 1); inm_rele_locked(NULL, inm); } inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving 0x%08x/%s with %d " "pending retransmissions.", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval __unused; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LIST_LOCK_ASSERT(); ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for 0x%08x/%s", __func__, igmp_rec_type_to_str(type), ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LIST_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { - struct epoch_tracker et; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval __unused, loop; IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); + NET_EPOCH_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } - NET_EPOCH_EXIT(et); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { - struct epoch_tracker et; struct in_ifaddr *ia; - NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) ip->ip_src = ia->ia_addr.sin_addr; - NET_EPOCH_EXIT(et); } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_init(const void *unused __unused) { netisr_register_vnet(&igmp_nh); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. */ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister_vnet(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); diff --git a/sys/netinet/in.c b/sys/netinet/in.c index cbb171395662..c83f0ae83fc4 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1,1517 +1,1513 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { - struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; - if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { - NET_EPOCH_EXIT(et); + if (ia->ia_addr.sin_addr.s_addr == in.s_addr) return (1); - } } - NET_EPOCH_EXIT(et); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_ZERONET(i) || IN_LOOPBACK(i)) return (0); return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } NET_EPOCH_EXIT(et); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } /* * Note: we don't need extra reference for ifa, since we called * with sx lock held, and ifaddr can not be deleted in concurrent * thread. */ EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { - struct epoch_tracker et; struct ifaddr *ifa; int found; + NET_EPOCH_ASSERT(); + if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } - NET_EPOCH_EXIT(et); return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); if (__predict_false(ifma_restart)) { ifma_restart = true; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. */ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index ac5c6a03ab54..a91d88277fce 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -1,3055 +1,3061 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK, * IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); int ifma_restart; /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static struct in_mfilter * imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(struct in_mfilter *, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif static struct grouptask free_gtask; static struct in_multi_head inm_free_list; static void inm_release_task(void *arg __unused); static void inm_init(void) { SLIST_INIT(&inm_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); } #ifdef EARLY_AP_STARTUP SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, inm_init, NULL); #else SYSINIT(inm_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_FIRST, inm_init, NULL); #endif void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void inm_disconnect(struct in_multi *inm) { struct ifnet *ifp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->inm_ifp; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->inm_ifma; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; } } } void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } struct in_mfilter * ip_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in_mfilter *imf; imf = malloc(sizeof(*imf), M_INMFILTER, mflags); if (imf != NULL) imf_init(imf, st0, st1); return (imf); } void ip_mfilter_free(struct in_mfilter *imf) { imf_purge(imf); free(imf, M_INMFILTER); } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { - struct epoch_tracker et; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + inm = inm_lookup_locked(ifp, ina); - NET_EPOCH_EXIT(et); return (inm); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in_mfilter * imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_mfilter *imf; struct in_multi *inm; gsin = (const struct sockaddr_in *)group; IP_MFILTER_FOREACH(imf, &imo->imo_head) { inm = imf->imf_inm; if (inm == NULL) continue; if ((ifp == NULL || (inm->inm_ifp == ifp)) && in_hosteq(inm->inm_addr, gsin->sin_addr)) { break; } } return (imf); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(struct in_mfilter *imf, const struct sockaddr *src) { struct ip_msource find; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in_mfilter *imf; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = imo_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->imf_st[1]; ims = imo_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { + struct epoch_tracker et; int error; IN_MULTI_LOCK(); + NET_EPOCH_ENTER(et); error = in_joingroup_locked(ifp, gina, imf, pinm); + NET_EPOCH_EXIT(et); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; + NET_EPOCH_ASSERT(); IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_deferred(inm); } else { *pinm = inm; } IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { + struct epoch_tracker et; int error; + NET_EPOCH_ENTER(et); IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); + NET_EPOCH_EXIT(et); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; - error = 0; - + NET_EPOCH_ASSERT(); IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); + error = 0; + CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; STAILQ_INIT(&imo->imo_head); INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } static void inp_gcmoptions(struct ip_moptions *imo) { struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); if ((inm = imf->imf_inm) != NULL) { if ((ifp = inm->inm_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in_leavegroup(inm, imf); } } ip_mfilter_free(imf); } free(imo, M_IPMOPTS); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is deferred to the epoch callback task. */ void inp_freemoptions(struct ip_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { - struct epoch_tracker et; - mreqn.imr_ifindex = ifp->if_index; - NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; - NET_EPOCH_EXIT(et); } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(ina, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; int error, is_new; ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); else error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqn.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && mreqn.imr_ifindex != 0) ifp = ifnet_byindex(mreqn.imr_ifindex); else ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqn.imr_address); break; } case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = ssa->sin.sin_family = AF_INET; gsa->sin.sin_len = ssa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ssa->sin.sin_addr = mreqs.imr_sourceaddr; ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_inp_locked; } } else { is_new = 0; inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_inp_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } } /* * Begin state merge transaction at IGMP layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &imf->imf_inm); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_inp_unlocked; } if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); goto out_inp_locked; } inm_acquire(imf->imf_inm); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } if (is_new) ip_mfilter_insert(&imo->imo_head, imf); imf_commit(imf); imf = NULL; out_inp_locked: INP_WUNLOCK(inp); out_inp_unlocked: IN_MULTI_UNLOCK(); if (is_new && imf) { if (imf->imf_inm != NULL) { IN_MULTI_LIST_LOCK(); inm_release_deferred(imf->imf_inm); IN_MULTI_LIST_UNLOCK(); } ip_mfilter_free(imf); } return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; int error; bool is_final; ifp = NULL; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ if (!is_final) { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); if (is_final && imf) { /* * Give up the multicast address record to which * the membership points. */ (void) in_leavegroup_locked(imf->imf_inm, imf); ip_mfilter_free(imf); } IN_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(addr, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); IN_MULTI_LOCK(); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } + NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { + NET_EPOCH_EXIT(et); CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); - if (retval) + if (retval) { + NET_EPOCH_EXIT(et); return (retval); + } IN_MULTI_LIST_LOCK(); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } - NET_EPOCH_EXIT(et); IN_MULTI_LIST_UNLOCK(); + NET_EPOCH_EXIT(et); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { [MCAST_UNDEFINED] = "un", [MCAST_INCLUDE] = "in", [MCAST_EXCLUDE] = "ex", }; _Static_assert(MCAST_UNDEFINED == 0 && MCAST_EXCLUDE + 1 == nitems(inm_modestrs), "inm_modestrs: no longer matches #defines"); static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { [IGMP_NOT_MEMBER] = "not-member", [IGMP_SILENT_MEMBER] = "silent", [IGMP_REPORTING_MEMBER] = "reporting", [IGMP_IDLE_MEMBER] = "idle", [IGMP_LAZY_MEMBER] = "lazy", [IGMP_SLEEPING_MEMBER] = "sleeping", [IGMP_AWAKENING_MEMBER] = "awakening", [IGMP_G_QUERY_PENDING_MEMBER] = "query-pending", [IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending", [IGMP_LEAVING_MEMBER] = "leaving", }; _Static_assert(IGMP_NOT_MEMBER == 0 && IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs), "inm_statetrs: no longer matches #defines"); static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index 2cface409d7f..cc866e33cfaa 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -1,205 +1,210 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in_inithead(void **head, int off); #ifdef VIMAGE extern int in_detachhead(void **head, int off); #endif /* * Do what we need to do when inserting a route. */ static struct radix_node * in_addroute(void *v_arg, void *n_arg, struct radix_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST * is set for anything that looks like a broadcast address. * This way, we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). * * We also do the same for local addresses, with the thought * that this might one day be used to speed up ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). */ if (rt->rt_flags & RTF_HOST) { - if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + struct epoch_tracker et; + bool bcast; + + NET_EPOCH_ENTER(et); + bcast = in_broadcast(sin->sin_addr, rt->rt_ifp); + NET_EPOCH_EXIT(et); + if (bcast) rt->rt_flags |= RTF_BROADCAST; - } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == - sin->sin_addr.s_addr) { + else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == + sin->sin_addr.s_addr) rt->rt_flags |= RTF_LOCAL; - } } if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; if (rt->rt_ifp != NULL) { /* * Check route MTU: * inherit interface MTU if not set or * check if MTU is too large. */ if (rt->rt_mtu == 0) { rt->rt_mtu = rt->rt_ifp->if_mtu; } else if (rt->rt_mtu > rt->rt_ifp->if_mtu) rt->rt_mtu = rt->rt_ifp->if_mtu; } return (rn_addroute(v_arg, n_arg, head, treenodes)); } static int _in_rt_was_here; /* * Initialize our routing tree. */ int in_inithead(void **head, int off) { struct rib_head *rh; rh = rt_table_init(32); if (rh == NULL) return (0); rh->rnh_addaddr = in_addroute; *head = (void *)rh; if (_in_rt_was_here == 0 ) { _in_rt_was_here = 1; } return 1; } #ifdef VIMAGE int in_detachhead(void **head, int off) { rt_table_destroy((struct rib_head *)(*head)); return (1); } #endif /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct ifaddr *ifa; int del; }; static int in_ifadownkill(const struct rtentry *rt, void *xap) { struct in_ifadown_arg *ap = xap; if (rt->rt_ifa != ap->ifa) return (0); if ((rt->rt_flags & RTF_STATIC) != 0 && ap->del == 0) return (0); return (1); } void in_ifadown(struct ifaddr *ifa, int delete) { struct in_ifadown_arg arg; KASSERT(ifa->ifa_addr->sa_family == AF_INET, ("%s: wrong family", __func__)); arg.ifa = ifa; arg.del = delete; rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } /* * inet versions of rt functions. These have fib extensions and * for now will just reference the _fib variants. * eventually this order will be reversed, */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { rtalloc_ign_fib(ro, ignflags, fibnum); } void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h index 50112481f236..96879db0abb7 100644 --- a/sys/netinet/in_var.h +++ b/sys/netinet/in_var.h @@ -1,483 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; SLIST_HEAD(in_multi_head, in_multi); /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ CK_STAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ struct callout ia_garp_timer; /* timer for retransmitting GARPs */ int ia_garp_count; /* count of retransmitted GARPs */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. */ CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) extern struct rmlock in_ifaddr_lock; #define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) #define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) #define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) #define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) #define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ /* struct rm_priotracker *t; */ \ do { \ + NET_EPOCH_ASSERT(); \ IN_IFADDR_RLOCK((t)); \ for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = CK_STAILQ_NEXT((ia), ia_link)) \ continue; \ IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. */ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. */ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ struct in_multi *imf_inm; /* associated multicast address */ STAILQ_ENTRY(in_mfilter) imf_entry; /* list entry */ }; /* * Helper types and functions for IPv4 multicast filters. */ STAILQ_HEAD(ip_mfilter_head, in_mfilter); struct in_mfilter *ip_mfilter_alloc(int mflags, int st0, int st1); void ip_mfilter_free(struct in_mfilter *); static inline void ip_mfilter_init(struct ip_mfilter_head *head) { STAILQ_INIT(head); } static inline struct in_mfilter * ip_mfilter_first(const struct ip_mfilter_head *head) { return (STAILQ_FIRST(head)); } static inline void ip_mfilter_insert(struct ip_mfilter_head *head, struct in_mfilter *imf) { STAILQ_INSERT_TAIL(head, imf, imf_entry); } static inline void ip_mfilter_remove(struct ip_mfilter_head *head, struct in_mfilter *imf) { STAILQ_REMOVE(head, imf, in_mfilter, imf_entry); } #define IP_MFILTER_FOREACH(imf, head) \ STAILQ_FOREACH(imf, head, imf_entry) static inline size_t ip_mfilter_count(struct ip_mfilter_head *head) { struct in_mfilter *imf; size_t num = 0; STAILQ_FOREACH(imf, head, imf_entry) num++; return (num); } /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in_multi_list_mtx; extern struct sx in_multi_sx; #define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx) #define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx) #define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED) #define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED) #define IN_MULTI_LOCK() sx_xlock(&in_multi_sx) #define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx) #define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED) #define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED) void inm_disconnect(struct in_multi *inm); extern int ifma_restart; /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { IN_MULTI_LIST_LOCK_ASSERT(); ++inm->inm_refcount; } static __inline void inm_acquire(struct in_multi *inm) { IN_MULTI_LIST_LOCK(); inm_acquire_locked(inm); IN_MULTI_LIST_UNLOCK(); } static __inline void inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm) { MPASS(inm->inm_refcount > 0); IN_MULTI_LIST_LOCK_ASSERT(); if (--inm->inm_refcount == 0) { MPASS(inmh != NULL); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(inmh, inm, inm_nrele); } } /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); void inm_release_deferred(struct in_multi *); void inm_release_list_deferred(struct in_multi_head *); struct in_multi * in_addmulti(struct in_addr *, struct ifnet *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void in_ifscrub_all(void); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index 819a0f9c75e6..ceccd5a02f3a 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -1,2303 +1,2297 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ struct sockaddr_dl sc_addr; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; int sc_advskew; int sc_advbase; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 int sc_init_counter; uint64_t sc_counter; /* authentication */ #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; #define CARP_INET 0 #define CARP_INET6 1 static int proto_reg[] = {-1, -1}; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. * To traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ VNET_DEFINE_STATIC(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Set DSCP in outgoing CARP packets. */ VNET_DEFINE_STATIC(int, carp_dscp) = 56; #define V_carp_dscp VNET(carp_dscp) /* Preempt slower nodes. */ VNET_DEFINE_STATIC(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ VNET_DEFINE_STATIC(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ VNET_DEFINE_STATIC(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I", "Accept incoming CARP packets"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_dscp_sysctl, "I", "DSCP value for carp packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_ifdown_adj), 0, "Interface down demotion factor adjustment"); VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); VNET_PCPUSTAT_SYSINIT(carpstats); VNET_PCPUSTAT_SYSUNINIT(carpstats); #define CARPSTATS_ADD(name, val) \ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ sizeof(uint64_t)], (val)) #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ NULL, MTX_DEF) #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CIF_FREE(cif) do { \ CIF_LOCK(cif); \ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ carp_free_if(cif); \ else \ CIF_UNLOCK(cif); \ } while (0) #define CARP_LOG(...) do { \ if (V_carp_log > 0) \ log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) do { \ if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ CARP_LOCK_ASSERT(sc); \ for (int _i = 0; \ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ ((ifa) = sc->sc_ifas[_i]) != NULL; \ ++_i) #define IFNET_FOREACH_CARP(ifp, sc) \ KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) || \ sx_xlocked(&carp_sx), ("cif_vrs not locked")); \ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) #define DEMOTE_ADVSKEW(sc) \ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); static struct carp_softc *carp_alloc(struct ifnet *); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); static void carp_free_if(struct carp_if *); static void carp_set_state(struct carp_softc *, int, const char* reason); static void carp_sc_state(struct carp_softc *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *, const char* reason); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; static struct sx carp_sx; static struct task carp_sendall_task = TASK_INITIALIZER(0, carp_send_ad_all, NULL); static void carp_hmac_prepare(struct carp_softc *sc) { uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET struct in_addr last, cur, in; #endif #ifdef INET6 struct in6_addr last6, cur6, in6; #endif CARP_LOCK_ASSERT(sc); /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET cur.s_addr = 0; do { found = 0; last = cur; cur.s_addr = 0xffffffff; CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && ntohl(in.s_addr) < ntohl(cur.s_addr)) { cur.s_addr = in.s_addr; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); #endif /* INET */ #ifdef INET6 memset(&cur6, 0, sizeof(cur6)); do { found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; if (ifa->ifa_addr->sa_family == AF_INET6 && memcmp(&in6, &last6, sizeof(in6)) > 0 && memcmp(&in6, &cur6, sizeof(in6)) < 0) { cur6 = in6; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; } static void carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; CARP_LOCK_ASSERT(sc); /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; iplen = *offp; *mp = NULL; CARPSTATS_INC(carps_ipackets); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); CARP_DEBUG("%s: pullup failed\n", __func__); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); return (IPPROTO_DONE); } #endif #ifdef INET6 int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; CARPSTATS_INC(carps_ipackets6); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); CARP_DEBUG("%s: packet received on non-carp interface: %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ len = m->m_len; IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } /* verify the CARP checksum */ m->m_data += *offp; if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6); return (IPPROTO_DONE); } #endif /* INET6 */ /* * This routine should not be necessary at all, but some switches * (VMWare ESX vswitches) can echo our own packets back at us, * and we must ignore them or they will cause us to drop out of * MASTER mode. * * We cannot catch all cases of network loops. Instead, what we * do here is catch any packet that arrives with a carp header * with a VHID of 0, that comes from an address that is our own. * These packets are by definition "from us" (even if they are from * a misconfigured host that is pretending to be us). * * The VHID test is outside this mini-function. */ static int carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; struct in_addr in4; #endif #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; #endif switch (af) { #ifdef INET case AF_INET: ip4 = mtod(m, struct ip *); in4 = ifatoia(ifa)->ia_addr.sin_addr; return (in4.s_addr == ip4->ip_src.s_addr); #endif #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); in6 = ifatoia6(ifa)->ia_addr.sin6_addr; return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); #endif default: break; } return (0); } static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; - struct epoch_tracker et; int error; + NET_EPOCH_ASSERT(); + /* * Verify that the VHID is valid on the receiving interface. * * There should be just one match. If there are none * the VHID is not valid and we drop the packet. If * there are multiple VHID matches, take just the first * one, for compatibility with previous code. While we're * scanning, check for obvious loops in the network topology * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ - NET_EPOCH_ENTER(et); error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == ch->carp_vhid) match = ifa; if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); - NET_EPOCH_EXIT(et); if (ifa == NULL) { if (error == ELOOP) { CARP_DEBUG("dropping looped packet on interface %s\n", ifp->if_xname); CARPSTATS_INC(carps_badif); /* ??? */ } else { CARPSTATS_INC(carps_badvhid); } m_freem(m); return; } /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); ifa_free(ifa); m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); ifa_free(ifa); if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, ifp->if_xname); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "more frequent advertisement received"); carp_setrun(sc, 0); carp_delroute(sc); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "preempting a slower master"); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "master will time out"); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } out: CARP_UNLOCK(sc); m_freem(m); } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT)) == NULL) { m_freem(m); CARPSTATS_INC(carps_onomem); return (ENOMEM); } bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } /* * To avoid LORs and possible recursions this function shouldn't * be called directly, but scheduled via taskqueue. */ static void carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } /* Send a periodic advertisement, executed in callout context. */ static void carp_send_ad(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_send_ad_error(struct carp_softc *sc, int error) { if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, sc->sc_carpdev->if_xname); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS && ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, sc->sc_carpdev->if_xname); carp_demote_adj(-V_carp_senderr_adj, msg); sc->sc_sendad_errors = 0; } else sc->sc_sendad_errors = 0; } } /* * Pick the best ifaddr on the given ifp for sending CARP * advertisements. * * "Best" here is defined by ifa_preferred(). This function is much * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). * * (This could be simplified to return the actual address, except that * it has a different format in AF_INET and AF_INET6.) */ static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { - struct epoch_tracker et; struct ifaddr *ifa, *best; + NET_EPOCH_ASSERT(); + if (af >= AF_MAX) return (NULL); best = NULL; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } - NET_EPOCH_EXIT(et); if (best != NULL) ifa_ref(best); return (best); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; CARP_LOCK_ASSERT(sc); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. */ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; /* Traffic class isn't defined in ip6 struct instead * it gets offset into flowid field */ ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa), RTF_UP); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa), 1); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; struct in_addr addr; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); } } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = LLADDR(&sc->sc_addr); return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { - struct epoch_tracker et; struct ifaddr *ifa; + NET_EPOCH_ASSERT(); + ifa = NULL; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } - NET_EPOCH_EXIT(et); return (ifa); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { - struct epoch_tracker et; struct ifaddr *ifa; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; - NET_EPOCH_EXIT(et); - mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (LLADDR(&sc->sc_addr)); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (LLADDR(&sc->sc_addr)); } - NET_EPOCH_EXIT(et); return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { CARP_UNLOCK(sc); CIF_UNLOCK(ifp->if_carp); return (1); } CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { carp_master_down_locked(sc, "master timed out"); } CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc, const char *reason) { CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, sc->sc_carpdev->if_xname, sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. */ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || !V_carp_allow) return; switch (sc->sc_state) { case INIT: carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); #endif break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } /* * Setup multicast structures. */ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; struct in_addr addr; if (ip_mfilter_first(&imo->imo_head) != NULL) return (0); imf = ip_mfilter_alloc(M_WAITOK, 0, 0); ip_mfilter_init(&imo->imo_head); imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imf->imf_inm)) != 0) { ip_mfilter_free(imf); break; } ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f[2]; struct in6_addr in6; if (ip6_mfilter_first(&im6o->im6o_head)) return (0); im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); ip6_mfilter_init(&im6o->im6o_head); im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { in6_leavegroup(im6f[0]->im6f_in6m, NULL); ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { #ifdef INET struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; #endif #ifdef INET6 struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f; #endif sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs != 0) break; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup(imf->imf_inm, NULL); ip_mfilter_free(imf); } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 != 0) break; while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { ip6_mfilter_remove(&im6o->im6o_head, im6f); in6_leavegroup(im6f->im6f_in6m, NULL); ip6_mfilter_free(im6f); } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &sc, sizeof(sc)); /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; default: printf("%s: carp is not supported for the %d interface type\n", ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp) { struct carp_softc *sc; struct carp_if *cif; sx_assert(&carp_sx, SA_XLOCKED); if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_init_counter = 1; sc->sc_state = INIT; sc->sc_ifasiz = sizeof(struct ifaddr *); sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); sc->sc_carpdev = ifp; CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static void carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); CARP_LOCK(sc); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; CARP_UNLOCK(sc); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; sx_assert(&carp_sx, SA_XLOCKED); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); CARP_UNLOCK(sc); CIF_LOCK(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, ifp->if_xname, error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static void carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) { CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); CARP_UNLOCK(sc); } int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct ifnet *ifp; struct carp_softc *sc = NULL; int error = 0, locked = 0; if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) return (ENXIO); switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: break; default: error = EOPNOTSUPP; goto out; } if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } sx_xlock(&carp_sx); switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { error = EINVAL; break; } if (ifp->if_carp) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; } if (sc == NULL) { sc = carp_alloc(ifp); CARP_LOCK(sc); sc->sc_vhid = carpr.carpr_vhid; LLADDR(&sc->sc_addr)[0] = 0; LLADDR(&sc->sc_addr)[1] = 0; LLADDR(&sc->sc_addr)[2] = 0x5e; LLADDR(&sc->sc_addr)[3] = 0; LLADDR(&sc->sc_addr)[4] = 1; LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; } else CARP_LOCK(sc); locked = 1; if (carpr.carpr_advbase > 0) { if (carpr.carpr_advbase > 255 || carpr.carpr_advbase < CARP_DFLTINTV) { error = EINVAL; break; } sc->sc_advbase = carpr.carpr_advbase; } if (carpr.carpr_advskew >= 255) { error = EINVAL; break; } sc->sc_advskew = carpr.carpr_advskew; if (carpr.carpr_key[0] != '\0') { bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "user requested via ifconfig"); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: carp_master_down_locked(sc, "user requested via ifconfig"); break; default: break; } } break; case SIOCGVH: { int priveleged; if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { error = EINVAL; break; } if (carpr.carpr_count < 1) { error = EMSGSIZE; break; } if (ifp->if_carp == NULL) { error = ENOENT; break; } priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); if (carpr.carpr_vhid != 0) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; if (sc == NULL) { error = ENOENT; break; } carp_carprcp(&carpr, sc, priveleged); error = copyout(&carpr, ifr_data_get_ptr(ifr), sizeof(carpr)); } else { int i, count; count = 0; IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr.carpr_count) { CIF_UNLOCK(ifp->if_carp); error = EMSGSIZE; break; } i = 0; IFNET_FOREACH_CARP(ifp, sc) { carp_carprcp(&carpr, sc, priveleged); carpr.carpr_count = count; error = copyout(&carpr, (caddr_t)ifr_data_get_ptr(ifr) + (i * sizeof(carpr)), sizeof(carpr)); if (error) { CIF_UNLOCK(ifp->if_carp); break; } i++; } } break; } default: error = EINVAL; } sx_xunlock(&carp_sx); out: if (locked) CARP_UNLOCK(sc); if_rele(ifp); return (error); } static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); } int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; int index, error; KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } sx_xlock(&carp_sx); if (ifp->if_carp == NULL) { sx_xunlock(&carp_sx); return (ENOPROTOOPT); } IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { sx_xunlock(&carp_sx); return (ENOENT); } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); sx_xunlock(&carp_sx); return (error); } index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) carp_grow_ifas(sc); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); sx_xunlock(&carp_sx); return (0); } void carp_detach(struct ifaddr *ifa, bool keep_cif) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); sx_xlock(&carp_sx); CARP_LOCK(sc); /* Shift array. */ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) carp_destroy(sc); else CARP_UNLOCK(sc); if (!keep_cif) CIF_FREE(cif); sx_xunlock(&carp_sx); } static void carp_set_state(struct carp_softc *sc, int state, const char *reason) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, sc->sc_carpdev->if_xname); CARP_LOG("%s: %s -> %s (%s)\n", subsys, carp_states[sc->sc_state], carp_states[state], reason); sc->sc_state = state; devctl_notify("CARP", subsys, carp_states[state], NULL); } } static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); } static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP) || !V_carp_allow) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc, INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } } static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); } static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; struct carp_softc *sc; new = V_carp_allow; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (V_carp_allow != new) { V_carp_allow = new; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) { CARP_LOCK(sc); if (curvnet == sc->sc_carpdev->if_vnet) carp_sc_state(sc); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } return (0); } static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_dscp; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (new < 0 || new > 63) return (EINVAL); V_carp_dscp = new; return (0); } static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); } #ifdef INET extern struct domain inetdomain; static struct protosw in_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif #ifdef INET6 extern struct domain inet6domain; static struct protosw in6_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; #endif static void carp_mod_cleanup(void) { #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET] = -1; } carp_iamatch_p = NULL; #endif #ifdef INET6 if (proto_reg[CARP_INET6] == 0) { (void)ip6proto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET6] = -1; } carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); sx_destroy(&carp_sx); } static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); return (proto_reg[CARP_INET6]); } err = ip6proto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); return (proto_reg[CARP_INET]); } err = ipproto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif return (0); } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c index ebcce2958a32..15ba528e7e0c 100644 --- a/sys/netinet/ip_encap.c +++ b/sys/netinet/ip_encap.c @@ -1,417 +1,415 @@ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * mobile-ip6 (uses RFC2473) * RFC3056 6to4 tunnel * isatap tunnel * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure"); struct encaptab { CK_LIST_ENTRY(encaptab) chain; int proto; int min_length; int exact_match; void *arg; encap_lookup_t lookup; encap_check_t check; encap_input_t input; }; struct srcaddrtab { CK_LIST_ENTRY(srcaddrtab) chain; encap_srcaddr_t srcaddr; void *arg; }; CK_LIST_HEAD(encaptab_head, encaptab); CK_LIST_HEAD(srcaddrtab_head, srcaddrtab); #ifdef INET static struct encaptab_head ipv4_encaptab = CK_LIST_HEAD_INITIALIZER(); static struct srcaddrtab_head ipv4_srcaddrtab = CK_LIST_HEAD_INITIALIZER(); #endif #ifdef INET6 static struct encaptab_head ipv6_encaptab = CK_LIST_HEAD_INITIALIZER(); static struct srcaddrtab_head ipv6_srcaddrtab = CK_LIST_HEAD_INITIALIZER(); #endif static struct mtx encapmtx, srcaddrmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); MTX_SYSINIT(srcaddrmtx, &srcaddrmtx, "srcaddrmtx", MTX_DEF); #define ENCAP_WLOCK() mtx_lock(&encapmtx) #define ENCAP_WUNLOCK() mtx_unlock(&encapmtx) #define ENCAP_RLOCK_TRACKER struct epoch_tracker encap_et -#define ENCAP_RLOCK() \ - epoch_enter_preempt(net_epoch_preempt, &encap_et) -#define ENCAP_RUNLOCK() \ - epoch_exit_preempt(net_epoch_preempt, &encap_et) -#define ENCAP_WAIT() epoch_wait_preempt(net_epoch_preempt) +#define ENCAP_RLOCK() NET_EPOCH_ENTER(encap_et) +#define ENCAP_RUNLOCK() NET_EPOCH_EXIT(encap_et) +#define ENCAP_WAIT() NET_EPOCH_WAIT() #define SRCADDR_WLOCK() mtx_lock(&srcaddrmtx) #define SRCADDR_WUNLOCK() mtx_unlock(&srcaddrmtx) #define SRCADDR_RLOCK_TRACKER struct epoch_tracker srcaddr_et #define SRCADDR_RLOCK() \ epoch_enter_preempt(net_epoch_preempt, &srcaddr_et) #define SRCADDR_RUNLOCK() \ epoch_exit_preempt(net_epoch_preempt, &srcaddr_et) #define SRCADDR_WAIT() epoch_wait_preempt(net_epoch_preempt) /* * ifaddr_event_ext handler. * * Tunnelling interfaces may request the kernel to notify when * some interface addresses appears or disappears. Usually tunnelling * interface must use an address configured on the local machine as * ingress address to be able receive datagramms and do not send * spoofed packets. */ static void srcaddr_change_event(void *arg __unused, struct ifnet *ifp, struct ifaddr *ifa, int event) { SRCADDR_RLOCK_TRACKER; struct srcaddrtab_head *head; struct srcaddrtab *p; /* Support for old ifaddr_event. */ EVENTHANDLER_INVOKE(ifaddr_event, ifp); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: head = &ipv4_srcaddrtab; break; #endif #ifdef INET6 case AF_INET6: head = &ipv6_srcaddrtab; break; #endif default: /* ignore event */ return; } SRCADDR_RLOCK(); CK_LIST_FOREACH(p, head, chain) { (*p->srcaddr)(p->arg, ifa->ifa_addr, event); } SRCADDR_RUNLOCK(); } EVENTHANDLER_DEFINE(ifaddr_event_ext, srcaddr_change_event, NULL, 0); static struct srcaddrtab * encap_register_srcaddr(struct srcaddrtab_head *head, encap_srcaddr_t func, void *arg, int mflags) { struct srcaddrtab *p, *tmp; if (func == NULL) return (NULL); p = malloc(sizeof(*p), M_NETADDR, mflags); if (p == NULL) return (NULL); p->srcaddr = func; p->arg = arg; SRCADDR_WLOCK(); CK_LIST_FOREACH(tmp, head, chain) { if (func == tmp->srcaddr && arg == tmp->arg) break; } if (tmp == NULL) CK_LIST_INSERT_HEAD(head, p, chain); SRCADDR_WUNLOCK(); if (tmp != NULL) { free(p, M_NETADDR); p = tmp; } return (p); } static int encap_unregister_srcaddr(struct srcaddrtab_head *head, const struct srcaddrtab *cookie) { struct srcaddrtab *p; SRCADDR_WLOCK(); CK_LIST_FOREACH(p, head, chain) { if (p == cookie) { CK_LIST_REMOVE(p, chain); SRCADDR_WUNLOCK(); SRCADDR_WAIT(); free(p, M_NETADDR); return (0); } } SRCADDR_WUNLOCK(); return (EINVAL); } static struct encaptab * encap_attach(struct encaptab_head *head, const struct encap_config *cfg, void *arg, int mflags) { struct encaptab *ep, *tmp; if (cfg == NULL || cfg->input == NULL || (cfg->check == NULL && cfg->lookup == NULL) || (cfg->lookup != NULL && cfg->exact_match != ENCAP_DRV_LOOKUP) || (cfg->exact_match == ENCAP_DRV_LOOKUP && cfg->lookup == NULL)) return (NULL); ep = malloc(sizeof(*ep), M_NETADDR, mflags); if (ep == NULL) return (NULL); ep->proto = cfg->proto; ep->min_length = cfg->min_length; ep->exact_match = cfg->exact_match; ep->arg = arg; ep->lookup = cfg->exact_match == ENCAP_DRV_LOOKUP ? cfg->lookup: NULL; ep->check = cfg->exact_match != ENCAP_DRV_LOOKUP ? cfg->check: NULL; ep->input = cfg->input; ENCAP_WLOCK(); CK_LIST_FOREACH(tmp, head, chain) { if (tmp->exact_match <= ep->exact_match) break; } if (tmp == NULL) CK_LIST_INSERT_HEAD(head, ep, chain); else CK_LIST_INSERT_BEFORE(tmp, ep, chain); ENCAP_WUNLOCK(); return (ep); } static int encap_detach(struct encaptab_head *head, const struct encaptab *cookie) { struct encaptab *ep; ENCAP_WLOCK(); CK_LIST_FOREACH(ep, head, chain) { if (ep == cookie) { CK_LIST_REMOVE(ep, chain); ENCAP_WUNLOCK(); ENCAP_WAIT(); free(ep, M_NETADDR); return (0); } } ENCAP_WUNLOCK(); return (EINVAL); } static int encap_input(struct encaptab_head *head, struct mbuf *m, int off, int proto) { ENCAP_RLOCK_TRACKER; struct encaptab *ep, *match; void *arg; int matchprio, ret; match = NULL; matchprio = 0; ENCAP_RLOCK(); CK_LIST_FOREACH(ep, head, chain) { if (ep->proto >= 0 && ep->proto != proto) continue; if (ep->min_length > m->m_pkthdr.len) continue; if (ep->exact_match == ENCAP_DRV_LOOKUP) ret = (*ep->lookup)(m, off, proto, &arg); else ret = (*ep->check)(m, off, proto, ep->arg); if (ret <= 0) continue; if (ret > matchprio) { match = ep; if (ep->exact_match != ENCAP_DRV_LOOKUP) arg = ep->arg; /* * No need to continue the search, we got the * exact match. */ if (ret >= ep->exact_match) break; matchprio = ret; } } if (match != NULL) { /* found a match, "match" has the best one */ ret = (*match->input)(m, off, proto, arg); ENCAP_RUNLOCK(); MPASS(ret == IPPROTO_DONE); return (IPPROTO_DONE); } ENCAP_RUNLOCK(); return (0); } #ifdef INET const struct srcaddrtab * ip_encap_register_srcaddr(encap_srcaddr_t func, void *arg, int mflags) { return (encap_register_srcaddr(&ipv4_srcaddrtab, func, arg, mflags)); } int ip_encap_unregister_srcaddr(const struct srcaddrtab *cookie) { return (encap_unregister_srcaddr(&ipv4_srcaddrtab, cookie)); } const struct encaptab * ip_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { return (encap_attach(&ipv4_encaptab, cfg, arg, mflags)); } int ip_encap_detach(const struct encaptab *cookie) { return (encap_detach(&ipv4_encaptab, cookie)); } int encap4_input(struct mbuf **mp, int *offp, int proto) { if (encap_input(&ipv4_encaptab, *mp, *offp, proto) != IPPROTO_DONE) return (rip_input(mp, offp, proto)); return (IPPROTO_DONE); } #endif /* INET */ #ifdef INET6 const struct srcaddrtab * ip6_encap_register_srcaddr(encap_srcaddr_t func, void *arg, int mflags) { return (encap_register_srcaddr(&ipv6_srcaddrtab, func, arg, mflags)); } int ip6_encap_unregister_srcaddr(const struct srcaddrtab *cookie) { return (encap_unregister_srcaddr(&ipv6_srcaddrtab, cookie)); } const struct encaptab * ip6_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { return (encap_attach(&ipv6_encaptab, cfg, arg, mflags)); } int ip6_encap_detach(const struct encaptab *cookie) { return (encap_detach(&ipv6_encaptab, cookie)); } int encap6_input(struct mbuf **mp, int *offp, int proto) { if (encap_input(&ipv6_encaptab, *mp, *offp, proto) != IPPROTO_DONE) return (rip6_input(mp, offp, proto)); return (IPPROTO_DONE); } #endif /* INET6 */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 3e5ce6283f0c..0bf9b86756f8 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -1,1071 +1,1058 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ VNET_DEFINE_STATIC(int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); VNET_DEFINE_STATIC(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets"); VNET_DEFINE_STATIC(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets"); VNET_DEFINE(int, drop_redirect) = 0; #define V_drop_redirect VNET(drop_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); VNET_DEFINE_STATIC(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); VNET_DEFINE_STATIC(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "ICMP reply source for non-local packets"); VNET_DEFINE_STATIC(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); /* Router requirements RFC 1812 section 4.3.2.3 requires 576 - 28. */ VNET_DEFINE_STATIC(int, icmp_quotelen) = 548; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); VNET_DEFINE_STATIC(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, "Reply to multicast ICMP Echo Request and Timestamp packets"); VNET_DEFINE_STATIC(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); VNET_DEFINE_STATIC(int, error_keeptags) = 0; #define V_error_keeptags VNET(error_keeptags) SYSCTL_INT(_net_inet_icmp, OID_AUTO, error_keeptags, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(error_keeptags), 0, "ICMP error response keeps copy of mbuf_tags of original packet"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { struct ip *oip, *nip; struct icmp *icp; struct mbuf *m; unsigned icmplen, icmpelen, nlen, oiphlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */ if (n->m_len < sizeof(struct ip) + ICMP_MINLEN) goto freeit; oip = mtod(n, struct ip *); oiphlen = oip->ip_hl << 2; if (n->m_len < oiphlen + ICMP_MINLEN) goto freeit; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && (n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); th = mtodo(n, oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && (n = m_pullup(n, oiphlen + tcphlen)) == NULL) goto freeit; oip = mtod(n, struct ip *); icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else if (oip->ip_p == IPPROTO_SCTP) { struct sctphdr *sh; struct sctp_chunkhdr *ch; if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr)) goto stdreply; if (oiphlen + sizeof(struct sctphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct sctphdr) && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); icmpelen = max(sizeof(struct sctphdr), min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); sh = mtodo(n, oiphlen); if (ntohl(sh->v_tag) == 0 && ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 && (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 || n->m_next != NULL)) { if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL) goto freeit; oip = mtod(n, struct ip *); sh = mtodo(n, oiphlen); ch = (struct sctp_chunkhdr *)(sh + 1); if (ch->chunk_type == SCTP_INITIATION) { icmpelen = max(sizeof(struct sctphdr) + 8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } } } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, sizeof(struct ip) + ICMP_MINLEN + icmplen); m->m_data += sizeof(struct ip); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); ICMPSTAT_INC(icps_outhist[type]); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ip), ("insufficient space for ip header")); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; if (V_error_keeptags) m_tag_copy_chain(m, n, M_NOWAIT); icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { - struct epoch_tracker et; struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; + NET_EPOCH_ASSERT(); + *mp = NULL; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; printf("icmp_input from %s to %s, len %d\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif - NET_EPOCH_ENTER(et); if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); - NET_EPOCH_EXIT(et); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; ICMPSTAT_INC(icps_inhist[icp->icmp_type]); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: code = PRC_UNREACH_PROTOCOL; break; case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp)); ip_stripoptions(m); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); - NET_EPOCH_EXIT(et); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); icp = (struct icmp *)(ip + 1); /* * The upper layer handler can rely on: * - The outer IP header has no options. * - The outer IP header, the ICMP header, the inner IP header, * and the first n bytes of the inner payload are contiguous. * n is at least 8, but might be larger based on * ICMP_ADVLENPREF. See its definition in ip_icmp.h. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; icp->icmp_type = ICMP_ECHOREPLY; goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; if (ia->ia_ifp == NULL) break; icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); - NET_EPOCH_EXIT(et); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char gwbuf[INET_ADDRSTRLEN]; printf("redirect dst %s to %s\n", inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf), inet_ntoa_r(icp->icmp_gwaddr, gwbuf)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_SOURCEQUENCH: default: break; } raw: - NET_EPOCH_EXIT(et); *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: - NET_EPOCH_EXIT(et); m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; - struct epoch_tracker et; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct nhop4_extended nh_ext; struct mbuf *opts = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); + NET_EPOCH_ASSERT(); + if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; - NET_EPOCH_EXIT(et); goto match; } } - NET_EPOCH_EXIT(et); } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; - NET_EPOCH_EXIT(et); goto match; } - NET_EPOCH_EXIT(et); } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; - NET_EPOCH_EXIT(et); goto match; } - NET_EPOCH_EXIT(et); } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = nh_ext.nh_src; match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == NULL && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { struct ip *ip = mtod(m, struct ip *); int hlen; struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char srcbuf[INET_ADDRSTRLEN]; printf("icmp_send dst %s src %s\n", inet_ntoa_r(ip->ip_dst, dstbuf), inet_ntoa_r(ip->ip_src, srcbuf)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ struct icmp_rate { const char *descr; struct counter_rate cr; }; VNET_DEFINE_STATIC(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" }, { "sctp ootb response" } }; #define V_icmp_rates VNET(icmp_rates) static void icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { V_icmp_rates[i].cr.cr_rate = counter_u64_alloc(M_WAITOK); V_icmp_rates[i].cr.cr_ticks = ticks; } } VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp_bandlimit_init, NULL); static void icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) counter_u64_free(V_icmp_rates[i].cr.cr_rate); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); int badport_bandlim(int which) { int64_t pps; if (V_icmplim == 0 || which == BANDLIM_UNLIMITED) return (0); KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); pps = counter_ratecheck(&V_icmp_rates[which].cr, V_icmplim); if (pps == -1) return (-1); if (pps > 0 && V_icmplim_output) log(LOG_NOTICE, "Limiting %s from %jd to %d packets/sec\n", V_icmp_rates[which].descr, (intmax_t )pps, V_icmplim); return (0); } diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 03f5fed3e82b..8991262cc120 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1,1432 +1,1425 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); VNET_DEFINE_STATIC(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ VNET_DEFINE_STATIC(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(pfil_head_t, inet_pfil_head); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct pfil_head_args args; struct protosw *pr; int i; CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_IP4; args.pa_headname = PFIL_INET_NAME; V_inet_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); pfil_head_unregister(V_inet_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); + NET_EPOCH_ASSERT(); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* IN_LOOPBACK must not appear on the wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Try to forward the packet, but if we fail continue. * ip_tryforward() does not generate redirects, so fall * through to normal processing if redirects are required. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ if (V_ipforwarding != 0 && V_ipsendredirects == 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ip = mtod(m, struct ip *); goto ours; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) != PFIL_PASS) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto ours; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); - NET_EPOCH_EXIT(et); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); - NET_EPOCH_EXIT(et); goto ours; } #endif } - NET_EPOCH_EXIT(et); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; - struct epoch_tracker et; int error, type = 0, code = 0, mtu = 0; + NET_EPOCH_ASSERT(); + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if ( #ifdef IPSTEALTH V_ipstealth == 0 && #endif ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif - NET_EPOCH_ENTER(et); if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); } else ia = NULL; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copym() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (V_ipstealth == 0) #endif ip->ip_ttl -= IPTTLDEC; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { /* mbuf consumed by IPsec */ m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); - goto out; + return; } /* No IPsec processing required */ } #endif /* IPSEC */ /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); - goto out; + return; } } if (mcopy == NULL) - goto out; + return; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); - goto out; + return; } icmp_error(mcopy, type, code, dest.s_addr, mtu); - out: - NET_EPOCH_EXIT(et); } #define CHECK_SO_CT(sp, ct) \ (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { bool stamped; stamped = false; if ((inp->inp_socket->so_options & SO_BINTIME) || CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { struct bintime boottimebin, bt; struct timespec ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt); getboottimebin(&boottimebin); bintime_add(&bt, &boottimebin); } else { bintime(&bt); } *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { struct bintime boottimebin, bt1; struct timespec ts1;; struct timeval tv; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &tv); } else { microtime(&tv); } *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { struct bintime boottimebin; struct timespec ts, ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&ts, &ts1, &ts); } else { nanotime(&ts); } *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { struct timespec ts; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &ts); else nanouptime(&ts); *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ VNET_DEFINE_STATIC(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index 14ebb6ef1007..dd43b467f1f3 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -1,2939 +1,2934 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) VNET_DEFINE_STATIC(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. */ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); VNET_DEFINE_STATIC(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ VNET_DEFINE_STATIC(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) VNET_DEFINE_STATIC(vifi_t, numvifs); #define V_numvifs VNET(numvifs) VNET_DEFINE_STATIC(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 VNET_DEFINE_STATIC(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) VNET_DEFINE_STATIC(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ VNET_DEFINE_STATIC(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); static int pim_input(struct mbuf *, int, int, void *); static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_PIM, .min_length = sizeof(struct ip) + PIM_MINLEN, .exact_match = 8, .check = pim_encapcheck, .input = pim_input }; /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) VNET_DEFINE_STATIC(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. */ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); VNET_DEFINE_STATIC(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) VNET_DEFINE_STATIC(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. */ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl_fib(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. */ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. */ ifp = NULL; } else { - struct epoch_tracker et; - sin.sin_addr = vifcp->vifc_lcl_addr; - NET_EPOCH_ENTER(et); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { - NET_EPOCH_EXIT(et); VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; - NET_EPOCH_EXIT(et); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__, (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig 0x%08x group %lx parent %x qh %p", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. */ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__, ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copym(mb0, 0, hlen, M_NOWAIT); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; int error __unused; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; STAILQ_INIT(&imo.imo_head); /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. */ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. */ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. */ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m __unused, int off __unused, int proto __unused, void *arg __unused) { KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); return (8); /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ static int pim_input(struct mbuf *m, int off, int proto, void *arg __unused) { struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = off; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x", __func__, datalen, ntohl(ip->ip_src.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go agead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reason perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d", __func__, ntohl(encap_ip->ip_src.s_addr), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__, ntohl(encap_ip->ip_dst.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copym() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ return (rip_input(&m, &off, proto)); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { free(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { ip_encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); diff --git a/sys/netinet/ip_options.c b/sys/netinet/ip_options.c index 87202eb806c4..cb9920d86d9e 100644 --- a/sys/netinet/ip_options.c +++ b/sys/netinet/ip_options.c @@ -1,765 +1,761 @@ /* * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE_STATIC(int, ip_dosourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, "Enable forwarding source routed IP packets"); #define V_ip_dosourceroute VNET(ip_dosourceroute) VNET_DEFINE_STATIC(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); #define V_ip_acceptsourceroute VNET(ip_acceptsourceroute) VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; - struct epoch_tracker et; + + NET_EPOCH_ASSERT(); /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) return 0; else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; - goto bad_unlocked; + goto bad; } - NET_EPOCH_ENTER(et); dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (V_ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!V_ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!V_ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (V_ipstealth) goto dropit; #endif if (!V_ip_dosourceroute) { if (V_ipforwarding) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; /* * Acting as a router, so generate * ICMP */ nosourcerouting: log(LOG_WARNING, "attempted source route from %s " "to %s\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); - NET_EPOCH_EXIT(et); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, RT_ALL_FIBS); if (ia == NULL) ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, RT_ALL_FIBS); if (ia == NULL) goto bad; memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); } else { /* XXX MRT 0 for routing */ if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) != 0) goto bad; memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } ip->ip_dst = ipaddr.sin_addr; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); } else if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { memcpy(cp + off, &nh_ext.nh_src, sizeof(struct in_addr)); } else { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); cp[IPOPT_OFFSET] += sizeof(uint32_t); } } - NET_EPOCH_EXIT(et); if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: - NET_EPOCH_EXIT(et); -bad_unlocked: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. */ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. */ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. */ void ip_stripoptions(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); int olen; olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), (size_t )(m->m_len - sizeof(struct ip))); } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = NULL; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * Check for the presence of the IP Router Alert option [RFC2113] * in the header of an IPv4 datagram. * * This call is not intended for use from the forwarding path; it is here * so that protocol domains may check for the presence of the option. * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert * option does not have much relevance to the implementation, though this * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. * Length checks on individual options should already have been performed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. */ int ip_checkrouteralert(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); u_char *cp; int opt, optlen, cnt, found_ra; found_ra = 0; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { #ifdef INVARIANTS if (cnt < IPOPT_OLEN + sizeof(*cp)) break; #endif optlen = cp[IPOPT_OLEN]; #ifdef INVARIANTS if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) break; #endif } switch (opt) { case IPOPT_RA: #ifdef INVARIANTS if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) break; else #endif found_ra = 1; break; default: break; } } return (found_ra); } diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index cbd2d72188fa..9266dce3f7c2 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,1579 +1,1582 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; int pflags = PFIL_OUT; if (flags & IP_FORWARDING) pflags |= PFIL_FWD; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) { case PFIL_DROPPED: *error = EPERM; /* FALLTHROUGH */ case PFIL_CONSUMED: return 1; /* Finished */ case PFIL_PASS: *error = 0; } m = *mp; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } static int ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *gw, struct route *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_ext.ext_pgs->tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { error = EAGAIN; goto done; } } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; struct epoch_tracker et; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst, sin; const struct sockaddr_in *gw; struct in_ifaddr *ia; struct in_addr src; int isbroadcast; uint16_t ip_len, ip_off; uint32_t fibnum; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } if ((flags & IP_FORWARDING) == 0) IPSTAT_INC(ips_localout); /* * dst/gw handling: * * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); if (ro != NULL) dst = (struct sockaddr_in *)&ro->ro_dst; else dst = &sin; if (ro == NULL || ro->ro_rt == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } gw = dst; NET_EPOCH_ENTER(et); again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ if (inp != NULL && ro != NULL && ro->ro_rt != NULL) RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ if (ro != NULL && ro->ro_rt != NULL && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); ia = NULL; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = 1; src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? in_ifaddr_broadcast(dst->sin_addr, ia) : 0; src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia, &in_ifa_tracker); isbroadcast = 0; /* fool gcc */ /* Interface may have no addresses. */ if (ia != NULL) src = IA_SIN(ia)->sin_addr; else src.s_addr = INADDR_ANY; } else if (ro != NULL) { if (ro->ro_rt == NULL) { /* * We want to do any cloning requested by the link * layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif if (ro->ro_rt == NULL || (ro->ro_rt->rt_flags & RTF_UP) == 0 || ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; counter_u64_add(ro->ro_rt->rt_pksent, 1); rt_update_ro_flags(ro); if (ro->ro_rt->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else if (ifp->if_flags & IFF_BROADCAST) isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); else isbroadcast = 0; if (ro->ro_rt->rt_flags & RTF_HOST) mtu = ro->ro_rt->rt_mtu; else mtu = ifp->if_mtu; src = IA_SIN(ia)->sin_addr; } else { struct nhop4_extended nh; bzero(&nh, sizeof(nh)); if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) != 0) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ifp = nh.nh_ifp; mtu = nh.nh_mtu; /* * We are rewriting here dst to be gw actually, contradicting * comment at the beginning of the function. However, in this * case we are always dealing with on stack dst. * In case if pfil(9) sends us back to beginning of the * function, the dst would be rewritten by ip_output_pfil(). */ MPASS(dst == &sin); dst->sin_addr = nh.nh_addr; ia = nh.nh_ia; src = nh.nh_src; isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && in_ifaddr_broadcast(dst->sin_addr, ia))); } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, ro, (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED_OUT(V_inet_pfil_head)) { switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ if (ro != NULL) { RO_RTFREE(ro); ro->ro_prepend = NULL; } gw = dst; ip = mtod(m, struct ip *); goto again; } } /* IN_LOOPBACK must not appear on the wire - RFC1122. */ if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = ip_output_send(inp, ifp, m, gw, ro, (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); error = ip_output_send(inp, ifp, m, gw, ro, true); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: NET_EPOCH_EXIT(et); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m0 = mb_unmapped_to_ext(m0); if (m0 == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { m0 = mb_unmapped_to_ext(m0); if (m0 == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; struct udphdr *uh; uint16_t cklen, csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if udp header is not in the first mbuf copy udplen */ if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); cklen = ntohs(cklen); } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } csum = in_cksum_skip(m, cklen + offset, offset); if (csum == 0) csum = 0xffff; } else { cklen = ntohs(ip->ip_len); csum = in_cksum_skip(m, cklen, offset); } offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { - struct inpcb *inp = sotoinpcb(so); + struct inpcb *inp = sotoinpcb(so); + struct epoch_tracker et; int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: INP_RLOCK(inp); if (inp->inp_options) { struct mbuf *options; options = m_copym(inp->inp_options, 0, M_COPYALL, M_NOWAIT); INP_RUNLOCK(inp); if (options != NULL) { error = sooptcopyout(sopt, mtod(options, char *), options->m_len); m_freem(options); } else error = ENOMEM; } else { INP_RUNLOCK(inp); sopt->sopt_valsize = 0; } break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: + NET_EPOCH_ENTER(et); error = inp_getmoptions(inp, sopt); + NET_EPOCH_EXIT(et); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } diff --git a/sys/netinet/sctp_bsd_addr.c b/sys/netinet/sctp_bsd_addr.c index e0432fb6b689..df8841c767c3 100644 --- a/sys/netinet/sctp_bsd_addr.c +++ b/sys/netinet/sctp_bsd_addr.c @@ -1,550 +1,549 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Declare all of our malloc named types */ MALLOC_DEFINE(SCTP_M_MAP, "sctp_map", "sctp asoc map descriptor"); MALLOC_DEFINE(SCTP_M_STRMI, "sctp_stri", "sctp stream in array"); MALLOC_DEFINE(SCTP_M_STRMO, "sctp_stro", "sctp stream out array"); MALLOC_DEFINE(SCTP_M_ASC_ADDR, "sctp_aadr", "sctp asconf address"); MALLOC_DEFINE(SCTP_M_ASC_IT, "sctp_a_it", "sctp asconf iterator"); MALLOC_DEFINE(SCTP_M_AUTH_CL, "sctp_atcl", "sctp auth chunklist"); MALLOC_DEFINE(SCTP_M_AUTH_KY, "sctp_atky", "sctp auth key"); MALLOC_DEFINE(SCTP_M_AUTH_HL, "sctp_athm", "sctp auth hmac list"); MALLOC_DEFINE(SCTP_M_AUTH_IF, "sctp_athi", "sctp auth info"); MALLOC_DEFINE(SCTP_M_STRESET, "sctp_stre", "sctp stream reset"); MALLOC_DEFINE(SCTP_M_CMSG, "sctp_cmsg", "sctp CMSG buffer"); MALLOC_DEFINE(SCTP_M_COPYAL, "sctp_cpal", "sctp copy all"); MALLOC_DEFINE(SCTP_M_VRF, "sctp_vrf", "sctp vrf struct"); MALLOC_DEFINE(SCTP_M_IFA, "sctp_ifa", "sctp ifa struct"); MALLOC_DEFINE(SCTP_M_IFN, "sctp_ifn", "sctp ifn struct"); MALLOC_DEFINE(SCTP_M_TIMW, "sctp_timw", "sctp time block"); MALLOC_DEFINE(SCTP_M_MVRF, "sctp_mvrf", "sctp mvrf pcb list"); MALLOC_DEFINE(SCTP_M_ITER, "sctp_iter", "sctp iterator control"); MALLOC_DEFINE(SCTP_M_SOCKOPT, "sctp_socko", "sctp socket option"); MALLOC_DEFINE(SCTP_M_MCORE, "sctp_mcore", "sctp mcore queue"); /* Global NON-VNET structure that controls the iterator */ struct iterator_control sctp_it_ctl; void sctp_wakeup_iterator(void) { wakeup(&sctp_it_ctl.iterator_running); } static void sctp_iterator_thread(void *v SCTP_UNUSED) { SCTP_IPI_ITERATOR_WQ_LOCK(); /* In FreeBSD this thread never terminates. */ for (;;) { msleep(&sctp_it_ctl.iterator_running, &sctp_it_ctl.ipi_iterator_wq_mtx, 0, "waiting_for_work", 0); sctp_iterator_worker(); } } void sctp_startup_iterator(void) { if (sctp_it_ctl.thread_proc) { /* You only get one */ return; } /* Initialize global locks here, thus only once. */ SCTP_ITERATOR_LOCK_INIT(); SCTP_IPI_ITERATOR_WQ_INIT(); TAILQ_INIT(&sctp_it_ctl.iteratorhead); kproc_create(sctp_iterator_thread, (void *)NULL, &sctp_it_ctl.thread_proc, RFPROC, SCTP_KTHREAD_PAGES, SCTP_KTRHEAD_NAME); } #ifdef INET6 void sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) { struct in6_ifaddr *ifa6; ifa6 = (struct in6_ifaddr *)ifa->ifa; ifa->flags = ifa6->ia6_flags; if (!MODULE_GLOBAL(ip6_use_deprecated)) { if (ifa->flags & IN6_IFF_DEPRECATED) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } if (ifa->flags & (IN6_IFF_DETACHED | IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) { ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; } else { ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; } } #endif /* INET6 */ static uint32_t sctp_is_desired_interface_type(struct ifnet *ifn) { int result; /* check the interface type to see if it's one we care about */ switch (ifn->if_type) { case IFT_ETHER: case IFT_ISO88023: case IFT_ISO88024: case IFT_ISO88025: case IFT_ISO88026: case IFT_STARLAN: case IFT_P10: case IFT_P80: case IFT_HY: case IFT_FDDI: case IFT_XETHER: case IFT_ISDNBASIC: case IFT_ISDNPRIMARY: case IFT_PTPSERIAL: case IFT_OTHER: case IFT_PPP: case IFT_LOOP: case IFT_SLIP: case IFT_GIF: case IFT_L2VLAN: case IFT_STF: case IFT_IP: case IFT_IPOVERCDLC: case IFT_IPOVERCLAW: case IFT_PROPVIRTUAL: /* NetGraph Virtual too */ case IFT_VIRTUALIPADDRESS: result = 1; break; default: result = 0; } return (result); } static void sctp_init_ifns_for_vrf(int vrfid) { /* * Here we must apply ANY locks needed by the IFN we access and also * make sure we lock any IFA that exists as we float through the * list of IFA's */ + struct epoch_tracker et; struct ifnet *ifn; struct ifaddr *ifa; struct sctp_ifa *sctp_ifa; uint32_t ifa_flags; #ifdef INET6 struct in6_ifaddr *ifa6; #endif IFNET_RLOCK(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { - struct epoch_tracker et; - if (sctp_is_desired_interface_type(ifn) == 0) { /* non desired type */ continue; } - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) { continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { continue; } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ continue; } break; #endif default: continue; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_flags = 0; break; #endif #ifdef INET6 case AF_INET6: ifa6 = (struct in6_ifaddr *)ifa; ifa_flags = ifa6->ia6_flags; break; #endif default: ifa_flags = 0; break; } sctp_ifa = sctp_add_addr_to_vrf(vrfid, (void *)ifn, ifn->if_index, ifn->if_type, ifn->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 0); if (sctp_ifa) { sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } } - NET_EPOCH_EXIT(et); } + NET_EPOCH_EXIT(et); IFNET_RUNLOCK(); } void sctp_init_vrf_list(int vrfid) { if (vrfid > SCTP_MAX_VRF_ID) /* can't do that */ return; /* Don't care about return here */ (void)sctp_allocate_vrf(vrfid); /* * Now we need to build all the ifn's for this vrf and there * addresses */ sctp_init_ifns_for_vrf(vrfid); } void sctp_addr_change(struct ifaddr *ifa, int cmd) { uint32_t ifa_flags = 0; if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { return; } /* * BSD only has one VRF, if this changes we will need to hook in the * right things here to get the id to pass to the address management * routine. */ if (SCTP_BASE_VAR(first_time) == 0) { /* Special test to see if my ::1 will showup with this */ SCTP_BASE_VAR(first_time) = 1; sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID); } if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) { /* don't know what to do with this */ return; } if (ifa->ifa_addr == NULL) { return; } if (sctp_is_desired_interface_type(ifa->ifa_ifp) == 0) { /* non desired type */ return; } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { return; } break; #endif #ifdef INET6 case AF_INET6: ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { /* skip unspecifed addresses */ return; } break; #endif default: /* non inet/inet6 skip */ return; } if (cmd == RTM_ADD) { (void)sctp_add_addr_to_vrf(SCTP_DEFAULT_VRFID, (void *)ifa->ifa_ifp, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_type, ifa->ifa_ifp->if_xname, (void *)ifa, ifa->ifa_addr, ifa_flags, 1); } else { sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr, ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_xname); /* * We don't bump refcount here so when it completes the * final delete will happen. */ } } void sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add){ struct ifnet *ifn; struct ifaddr *ifa; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { if (!(*pred) (ifn)) { continue; } CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE); } } IFNET_RUNLOCK(); } struct mbuf * sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, int how, int allonebuf, int type) { struct mbuf *m = NULL; m = m_getm2(NULL, space_needed, how, type, want_header ? M_PKTHDR : 0); if (m == NULL) { /* bad, no memory */ return (m); } if (allonebuf) { if (SCTP_BUF_SIZE(m) < space_needed) { m_freem(m); return (NULL); } KASSERT(SCTP_BUF_NEXT(m) == NULL, ("%s: no chain allowed", __FUNCTION__)); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mb(m, SCTP_MBUF_IALLOC); } #endif return (m); } #ifdef SCTP_PACKET_LOGGING void sctp_packet_log(struct mbuf *m) { int *lenat, thisone; void *copyto; uint32_t *tick_tock; int length; int total_len; int grabbed_lock = 0; int value, newval, thisend, thisbegin; /* * Buffer layout. -sizeof this entry (total_len) -previous end * (value) -ticks of log (ticks) o -ip packet o -as logged - * where this started (thisbegin) x <--end points here */ length = SCTP_HEADER_LEN(m); total_len = SCTP_SIZE32((length + (4 * sizeof(int)))); /* Log a packet to the buffer. */ if (total_len > SCTP_PACKET_LOG_SIZE) { /* Can't log this packet I have not a buffer big enough */ return; } if (length < (int)(SCTP_MIN_V4_OVERHEAD + sizeof(struct sctp_cookie_ack_chunk))) { return; } atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), 1); try_again: if (SCTP_BASE_VAR(packet_log_writers) > SCTP_PKTLOG_WRITERS_NEED_LOCK) { SCTP_IP_PKTLOG_LOCK(); grabbed_lock = 1; again_locked: value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto again_locked; } } else { value = SCTP_BASE_VAR(packet_log_end); newval = SCTP_BASE_VAR(packet_log_end) + total_len; if (newval >= SCTP_PACKET_LOG_SIZE) { /* we wrapped */ thisbegin = 0; thisend = total_len; } else { thisbegin = SCTP_BASE_VAR(packet_log_end); thisend = newval; } if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { goto try_again; } } /* Sanity check */ if (thisend >= SCTP_PACKET_LOG_SIZE) { SCTP_PRINTF("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n", thisbegin, thisend, SCTP_BASE_VAR(packet_log_writers), grabbed_lock, SCTP_BASE_VAR(packet_log_end)); SCTP_BASE_VAR(packet_log_end) = 0; goto no_log; } lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisbegin]; *lenat = total_len; lenat++; *lenat = value; lenat++; tick_tock = (uint32_t *)lenat; lenat++; *tick_tock = sctp_get_tick_count(); copyto = (void *)lenat; thisone = thisend - sizeof(int); lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisone]; *lenat = thisbegin; if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); grabbed_lock = 0; } m_copydata(m, 0, length, (caddr_t)copyto); no_log: if (grabbed_lock) { SCTP_IP_PKTLOG_UNLOCK(); } atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), 1); } int sctp_copy_out_packet_log(uint8_t *target, int length) { /* * We wind through the packet log starting at start copying up to * length bytes out. We return the number of bytes copied. */ int tocopy, this_copy; int *lenat; int did_delay = 0; tocopy = length; if (length < (int)(2 * sizeof(int))) { /* not enough room */ return (0); } if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); again: if ((did_delay == 0) && (SCTP_BASE_VAR(packet_log_writers) != SCTP_PKTLOG_WRITERS_NEED_LOCK)) { /* * we delay here for just a moment hoping the * writer(s) that were present when we entered will * have left and we only have locking ones that will * contend with us for the lock. This does not * assure 100% access, but its good enough for a * logging facility like this. */ did_delay = 1; DELAY(10); goto again; } } SCTP_IP_PKTLOG_LOCK(); lenat = (int *)target; *lenat = SCTP_BASE_VAR(packet_log_end); lenat++; this_copy = min((length - sizeof(int)), SCTP_PACKET_LOG_SIZE); memcpy((void *)lenat, (void *)SCTP_BASE_VAR(packet_log_buffer), this_copy); if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); } SCTP_IP_PKTLOG_UNLOCK(); return (this_copy + sizeof(int)); } #endif diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index b18de150fd33..f2d0321ffdab 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -1,2826 +1,2818 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0; VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) #define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int, struct prison *); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); ICMP6STAT_INC(icp6s_outhist[type]); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int ip6len, error; ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_ADMIN_PROHIB; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(m, off, icmp6len) != 0) return (IPPROTO_DONE); /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; struct prison *pr; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; pr = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_vnet == ifp->if_vnet) break; sx_sunlock(&allprison_lock); if (pr == NULL) pr = curthread->td_ucred->cr_prison; if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off, pr); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { IP6_EXTHDR_CHECK(m, off, icmp6len, IPPROTO_DONE); error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off, struct prison *pr) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { ifa_free(&ia6->ia_ifa); nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } ifa_free(&ia6->ia_ifa); } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { - struct epoch_tracker et; struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; + NET_EPOCH_ASSERT(); + if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; - NET_EPOCH_EXIT(et); return (addrsofif); } addrs += addrsofif; } - NET_EPOCH_EXIT(et); return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { - struct epoch_tracker et; struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; + NET_EPOCH_ASSERT(); + if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ - NET_EPOCH_ENTER(et); ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; - NET_EPOCH_EXIT(et); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } - NET_EPOCH_EXIT(et); - return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct epoch_tracker et; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_ip_p != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); continue; } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, inp->in6p_icmp6filt)) { INP_RUNLOCK(inp); continue; } if (last != NULL) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked( &last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = inp; } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); INP_RUNLOCK(last); } else { m_freem(m); IP6STAT_DEC(ip6s_delivered); } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } if (ia != NULL) ifa_free(&ia->ia_ifa); } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { mld_fasttimo(); } void icmp6_slowtimo(void) { mld_slowtimo(); } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); ifp = m->m_pkthdr.rcvif; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); if (fib6_lookup_nh_basic(ifp->if_fib, &kdst, scopeid, 0, 0,&nh6)==0){ if ((nh6.nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* * Embed scope zone id into next hop address, since * fib6_lookup_nh_basic() returns address without embedded * scope zone id. */ if (in6_setscope(&nh6.nh_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh6.nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. */ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = RTF_HOST; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) in6_rtredirect((struct sockaddr *)&sdst, gw, (struct sockaddr *)NULL, rt_flags, (struct sockaddr *)&ssrc, fibnum); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, rt->rt_fibnum); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ - struct epoch_tracker et; int len; struct nd_opt_hdr *nd_opt; char *lladdr; - NET_EPOCH_ENTER(et); ln = nd6_lookup(router_ll6, 0, ifp); - NET_EPOCH_EXIT(et); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 4de6e8d2dfe0..c92ee171b784 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -1,2564 +1,2551 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; int fibnum; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ia62ifa(ia)->ifa_ifp->if_fib; /* Announce arrival of local address to this FIB. */ rt_newaddrmsg_fib(cmd, &ia->ia_ifa, 0, &rt, fibnum); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } static struct in6_multi_mship * in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, int *errorp, int delay) { struct in6_multi_mship *imm; int error; imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); if (imm == NULL) { *errorp = ENOBUFS; return (NULL); } delay = (delay * PR_FASTHZ) / hz; error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); if (error) { *errorp = error; free(imm, M_IP6MADDR); return (NULL); } return (imm); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); CK_LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr != NULL) in6_leavegroup(imm->i6mm_maddr, NULL); free(imm, M_IP6MADDR); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } NET_EPOCH_EXIT(et); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto done; } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) goto done; ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } done: WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Invoking IPv6 network device address event may sleep"); ifa_ref(&ia->ia_ifa); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_ADD); ifa_free(&ia->ia_ifa); return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { - struct epoch_tracker et; struct ifaddr *ifa; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } - NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct epoch_tracker et; struct ifaddr *ifa; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { - struct epoch_tracker et; struct sockaddr_in6 *sin6; struct ifaddr *ifa; + NET_EPOCH_ASSERT(); + if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } - NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; - struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia6; + NET_EPOCH_ASSERT(); + in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; - if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { - NET_EPOCH_EXIT(et); + if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) return (1); - } } - NET_EPOCH_EXIT(et); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { - struct epoch_tracker et; int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ + NET_EPOCH_ASSERT(); + dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); - NET_EPOCH_EXIT(et); return (besta); } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); - NET_EPOCH_EXIT(et); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); - NET_EPOCH_EXIT(et); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); - NET_EPOCH_EXIT(et); return dep[1]; } - NET_EPOCH_EXIT(et); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } - NET_EPOCH_EXIT(et); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); + NET_EPOCH_EXIT(et); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (0); if ((ND_IFINFO(ifp)->flags & (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD)) != 0) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { struct epoch_tracker et; unsigned long maxmtu = 0; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } NET_EPOCH_EXIT(et); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; + NET_EPOCH_ASSERT(); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { - struct epoch_tracker et; struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ - NET_EPOCH_ENTER(et); ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { - NET_EPOCH_EXIT(et); return 0; } - NET_EPOCH_EXIT(et); log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. */ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c index 7ce680f2f91a..fd46f4494a43 100644 --- a/sys/netinet6/in6_ifattach.c +++ b/sys/netinet6/in6_ifattach.c @@ -1,910 +1,893 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { - struct epoch_tracker et; struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } - NET_EPOCH_EXIT(et); return -1; found: - IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_ATM: case IFT_IEEE1394: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ - if (addrlen != 8 && addrlen != 6) { - NET_EPOCH_EXIT(et); + if (addrlen != 8 && addrlen != 6) return -1; - } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ - if (bcmp(addr, allzero, addrlen) == 0) { - NET_EPOCH_EXIT(et); + if (bcmp(addr, allzero, addrlen) == 0) return -1; - } - if (bcmp(addr, allone, addrlen) == 0) { - NET_EPOCH_EXIT(et); + if (bcmp(addr, allone, addrlen) == 0) return -1; - } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ - NET_EPOCH_EXIT(et); return -1; case IFT_INFINIBAND: - if (addrlen != 20) { - NET_EPOCH_EXIT(et); + if (addrlen != 20) return -1; - } bcopy(addr + 12, &in6->s6_addr[8], 8); break; default: - NET_EPOCH_EXIT(et); return -1; } /* sanity check: g bit must not indicate "group" */ - if (EUI64_GROUP(in6)) { - NET_EPOCH_EXIT(et); + if (EUI64_GROUP(in6)) return -1; - } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && - bcmp(&in6->s6_addr[9], allzero, 7) == 0) { - NET_EPOCH_EXIT(et); + bcmp(&in6->s6_addr[9], allzero, 7) == 0) return -1; - } - NET_EPOCH_EXIT(et); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { - struct epoch_tracker et; struct ifnet *ifp; + NET_EPOCH_ASSERT(); + /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); - NET_EPOCH_EXIT(et); goto success; } } - NET_EPOCH_EXIT(et); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; struct nd_prefix *pr; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { /* * Another thread removed the address that we just added. * This should be rare, but it happens. */ nd6log((LOG_NOTICE, "%s: %s: new link-local address " "disappeared\n", __func__, if_name(ifp))); return (-1); } ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } else nd6_prefix_rele(pr); return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; + NET_EPOCH_ASSERT(); + if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; ND_IFINFO(ifp)->flags |= ND6_IFF_NO_DAD; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * check that loopback address doesn't exist yet. */ ia = in6ifa_ifwithaddr(&in6addr_loopback, 0); if (ia == NULL) in6_ifattach_loopback(ifp); else ifa_free(&ia->ia_ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) in6_ifattach_linklocal(ifp, altifp); else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * * When shutting down a VNET we clean up layers top-down. In that case * upper layer protocols (ulp) are cleaned up already and locks are destroyed * and we must not call into these cleanup functions anymore, thus purgeulp * is set to 0 in that case by in6_ifdetach_destroy(). * The normal case of destroying a (cloned) interface still needs to cleanup * everything related to the interface and will have purgeulp set to 1. */ static void _in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * nuke any of IPv6 addresses we have */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } if (purgeulp) { IN6_MULTI_LOCK(); in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); IN6_MULTI_UNLOCK(); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * Remove neighbor management table. * Enabling the nd6_purge will panic on vmove for interfaces on VNET * teardown as the IPv6 layer is cleaned up already and the locks * are destroyed. */ if (purgeulp) nd6_purge(ifp); } void in6_ifdetach(struct ifnet *ifp) { _in6_ifdetach(ifp, 1); } void in6_ifdetach_destroy(struct ifnet *ifp) { _in6_ifdetach(ifp, 0); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { struct in6_multi_head inmh; SLIST_INIT(&inmh); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); mld_ifdetach(ifp, &inmh); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); in6m_release_list_deferred(&inmh); /* * Make sure all multicast deletions invoking if_ioctl() are * completed before returning. Else we risk accessing a freed * ifnet structure pointer. */ in6m_release_wait(); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); diff --git a/sys/netinet6/in6_mcast.c b/sys/netinet6/in6_mcast.c index 904154f56da5..5e7969d2a6da 100644 --- a/sys/netinet6/in6_mcast.c +++ b/sys/netinet6/in6_mcast.c @@ -1,2908 +1,2910 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK, * IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in6_multi_list_mtx; MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); struct mtx in6_multi_free_mtx; MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); struct sx in6_multi_sx; SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(struct in6_mfilter *, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } struct in6_mfilter * ip6_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in6_mfilter *imf; imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags); if (imf != NULL) im6f_init(imf, st0, st1); return (imf); } void ip6_mfilter_free(struct in6_mfilter *imf) { im6f_purge(imf); free(imf, M_IN6MFILTER); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_mfilter *imf; struct in6_multi *inm; gsin6 = (const struct sockaddr_in6 *)group; IP6_MFILTER_FOREACH(imf, &imo->im6o_head) { inm = imf->im6f_in6m; if (inm == NULL) continue; if ((ifp == NULL || (inm->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &gsin6->sin6_addr)) { break; } } return (imf); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in6_msource * im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src) { struct ip6_msource find; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in6_mfilter *imf; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = im6o_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->im6f_st[1]; ims = im6o_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct epoch_tracker et; struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, group); NET_EPOCH_EXIT(et); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ static void in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; ifp = inm->in6m_ifp; MPASS(ifma->ifma_llifma == NULL); /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == NULL, ("%s: ifma_protospec != NULL", __func__)); if (ifp == NULL) ifp = ifma->ifma_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); } } static struct grouptask free_gtask; static struct in6_multi_head in6m_free_list; static void in6m_release_task(void *arg __unused); static void in6m_init(void) { SLIST_INIT(&in6m_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task"); } #ifdef EARLY_AP_STARTUP SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, in6m_init, NULL); #else SYSINIT(in6m_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_SECOND, in6m_init, NULL); #endif void in6m_release_list_deferred(struct in6_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void in6m_release_wait(void) { /* Wait for all jobs to complete. */ gtaskqueue_drain_all(free_gtask.gt_taskqueue); } void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; if (ifp == NULL) return; /* already called */ inm->in6m_ifp = NULL; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->in6m_ifma; if (ifma == NULL) return; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); } } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (void *)ifa; LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships, i6mm_chain, imm_tmp) { if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); in6m_rele_locked(inmh, inm); } } } } static void in6m_release_task(void *arg __unused) { struct in6_multi_head in6m_free_tmp; struct in6_multi *inm, *tinm; SLIST_INIT(&in6m_free_tmp); mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); IN6_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); in6m_release(inm); } IN6_MULTI_UNLOCK(); } /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ int in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. */ int in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_multi_head inmh; struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in6_getmulti(ifp, mcaddr, &inm); if (error) { CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: SLIST_INIT(&inmh); if (error) { struct epoch_tracker et; CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); NET_EPOCH_EXIT(et); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_multi_head inmh; struct in6_mfilter timf; struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = 0; if (ifp) error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) IF_ADDR_WLOCK(ifp); SLIST_INIT(&inmh); if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); if (ifp) IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; STAILQ_INIT(&imo->im6o_head); INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. * * XXX can all be safely deferred to epoch_call * */ static void inp_gcmoptions(struct ip6_moptions *imo) { struct in6_mfilter *imf; struct in6_multi *inm; struct ifnet *ifp; while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); if ((inm = imf->im6f_in6m) != NULL) { if ((ifp = inm->in6m_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in6_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in6_leavegroup(inm, imf); } } ip6_mfilter_free(imf); } free(imo, M_IP6MOPTS); } void ip6_freemoptions(struct ip6_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. */ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. Look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(inp->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = inp ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); return (nh6.nh_ifp); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct in6_multi_head inmh; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; int error, is_new; SLIST_INIT(&inmh); ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. */ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { if (V_if_index < mreq.ipv6mr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(mreq.ipv6mr_interface); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_in6p_locked; } } else { is_new = 0; inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_in6p_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } } /* * Begin state merge transaction at MLD layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &imf->im6f_in6m, 0); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_in6p_unlocked; } if (error) { goto out_in6p_locked; } /* * NOTE: Refcount from in6_joingroup_locked() * is protecting membership. */ } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } if (is_new) ip6_mfilter_insert(&imo->im6o_head, imf); im6f_commit(imf); imf = NULL; out_in6p_locked: INP_WUNLOCK(inp); out_in6p_unlocked: IN6_MULTI_UNLOCK(); if (is_new && imf) { if (imf->im6f_in6m != NULL) { struct in6_multi_head inmh; SLIST_INIT(&inmh); SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer); in6m_release_list_deferred(&inmh); } ip6_mfilter_free(imf); } return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. */ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; int error; bool is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { if (V_if_index < ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. */ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { ifp = ifnet_byindex(ifindex); } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ if (!is_final) { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); if (is_final && imf) { /* * Give up the multicast address record to which * the membership points. */ (void)in6_leavegroup_locked(inm, imf); ip6_mfilter_free(imf); } IN6_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); if (V_if_index < ifindex) return (EINVAL); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EINVAL); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); } imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. */ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_MLD, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } + NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { + NET_EPOCH_EXIT(et); CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); - if (retval) + if (retval) { + NET_EPOCH_EXIT(et); return (retval); + } IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } - NET_EPOCH_EXIT(et); - IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); + NET_EPOCH_EXIT(et); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. */ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ diff --git a/sys/netinet6/in6_var.h b/sys/netinet6/in6_var.h index 3e535310b29e..9856acd87429 100644 --- a/sys/netinet6/in6_var.h +++ b/sys/netinet6/in6_var.h @@ -1,928 +1,929 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_var.h,v 1.56 2001/03/29 05:34:31 itojun Exp $ */ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ /* * pltime/vltime are just for future reference (required to implements 2 * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. * userland -> kernel: accept pltime/vltime * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { time_t ia6t_expire; /* valid lifetime expiration time */ time_t ia6t_preferred; /* preferred lifetime expiration time */ u_int32_t ia6t_vltime; /* valid lifetime */ u_int32_t ia6t_pltime; /* prefix lifetime */ }; struct nd_ifinfo; struct scope6_id; struct lltable; struct mld_ifsoftc; struct in6_multi; struct in6_ifextra { counter_u64_t *in6_ifstat; counter_u64_t *icmp6_ifstat; struct nd_ifinfo *nd_ifinfo; struct scope6_id *scope6_id; struct lltable *lltable; struct mld_ifsoftc *mld_ifinfo; }; #define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable) #ifdef _KERNEL SLIST_HEAD(in6_multi_head, in6_multi); MALLOC_DECLARE(M_IP6MADDR); struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags struct sockaddr_in6 ia_addr; /* interface address */ struct sockaddr_in6 ia_net; /* network number of interface */ struct sockaddr_in6 ia_dstaddr; /* space for destination addr */ struct sockaddr_in6 ia_prefixmask; /* prefix mask */ u_int32_t ia_plen; /* prefix length */ CK_STAILQ_ENTRY(in6_ifaddr) ia_link; /* list of IPv6 addresses */ int ia6_flags; struct in6_addrlifetime ia6_lifetime; time_t ia6_createtime; /* the creation time of this address, which is * currently used for temporary addresses only. */ time_t ia6_updatetime; /* back pointer to the ND prefix (for autoconfigured addresses only) */ struct nd_prefix *ia6_ndpr; /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; /* entry in bucket of inet6 addresses */ CK_LIST_ENTRY(in6_ifaddr) ia6_hash; }; /* List of in6_ifaddr's. */ CK_STAILQ_HEAD(in6_ifaddrhead, in6_ifaddr); CK_LIST_HEAD(in6_ifaddrlisthead, in6_ifaddr); #endif /* _KERNEL */ /* control structure to manage address selection policy */ struct in6_addrpolicy { struct sockaddr_in6 addr; /* prefix address */ struct sockaddr_in6 addrmask; /* prefix mask */ int preced; /* precedence */ int label; /* matching label */ u_quad_t use; /* statistics */ }; /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). */ struct in6_ifstat { uint64_t ifs6_in_receive; /* # of total input datagram */ uint64_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ uint64_t ifs6_in_toobig; /* # of datagrams exceeded MTU */ uint64_t ifs6_in_noroute; /* # of datagrams with no route */ uint64_t ifs6_in_addrerr; /* # of datagrams with invalid dst */ uint64_t ifs6_in_protounknown; /* # of datagrams with unknown proto */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_truncated; /* # of truncated datagrams */ uint64_t ifs6_in_discard; /* # of discarded datagrams */ /* NOTE: fragment timeout is not here */ uint64_t ifs6_in_deliver; /* # of datagrams delivered to ULP */ /* NOTE: increment on final dst if */ uint64_t ifs6_out_forward; /* # of datagrams forwarded */ /* NOTE: increment on outgoing if */ uint64_t ifs6_out_request; /* # of outgoing datagrams from ULP */ /* NOTE: does not include forwrads */ uint64_t ifs6_out_discard; /* # of discarded datagrams */ uint64_t ifs6_out_fragok; /* # of datagrams fragmented */ uint64_t ifs6_out_fragfail; /* # of datagrams failed on fragment */ uint64_t ifs6_out_fragcreat; /* # of fragment datagrams */ /* NOTE: this is # after fragment */ uint64_t ifs6_reass_reqd; /* # of incoming fragmented packets */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_mcast; /* # of inbound multicast datagrams */ uint64_t ifs6_out_mcast; /* # of outbound multicast datagrams */ }; /* * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry. * XXX: I'm not sure if this file is the right place for this structure... */ struct icmp6_ifstat { /* * Input statistics */ /* ipv6IfIcmpInMsgs, total # of input messages */ uint64_t ifs6_in_msg; /* ipv6IfIcmpInErrors, # of input error messages */ uint64_t ifs6_in_error; /* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */ uint64_t ifs6_in_dstunreach; /* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */ uint64_t ifs6_in_adminprohib; /* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */ uint64_t ifs6_in_timeexceed; /* ipv6IfIcmpInParmProblems, # of input parameter problem errors */ uint64_t ifs6_in_paramprob; /* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */ uint64_t ifs6_in_pkttoobig; /* ipv6IfIcmpInEchos, # of input echo requests */ uint64_t ifs6_in_echo; /* ipv6IfIcmpInEchoReplies, # of input echo replies */ uint64_t ifs6_in_echoreply; /* ipv6IfIcmpInRouterSolicits, # of input router solicitations */ uint64_t ifs6_in_routersolicit; /* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */ uint64_t ifs6_in_routeradvert; /* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */ uint64_t ifs6_in_neighborsolicit; /* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */ uint64_t ifs6_in_neighboradvert; /* ipv6IfIcmpInRedirects, # of input redirects */ uint64_t ifs6_in_redirect; /* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */ uint64_t ifs6_in_mldquery; /* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */ uint64_t ifs6_in_mldreport; /* ipv6IfIcmpInGroupMembReductions, # of input MLD done */ uint64_t ifs6_in_mlddone; /* * Output statistics. We should solve unresolved routing problem... */ /* ipv6IfIcmpOutMsgs, total # of output messages */ uint64_t ifs6_out_msg; /* ipv6IfIcmpOutErrors, # of output error messages */ uint64_t ifs6_out_error; /* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */ uint64_t ifs6_out_dstunreach; /* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */ uint64_t ifs6_out_adminprohib; /* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */ uint64_t ifs6_out_timeexceed; /* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */ uint64_t ifs6_out_paramprob; /* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */ uint64_t ifs6_out_pkttoobig; /* ipv6IfIcmpOutEchos, # of output echo requests */ uint64_t ifs6_out_echo; /* ipv6IfIcmpOutEchoReplies, # of output echo replies */ uint64_t ifs6_out_echoreply; /* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */ uint64_t ifs6_out_routersolicit; /* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */ uint64_t ifs6_out_routeradvert; /* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */ uint64_t ifs6_out_neighborsolicit; /* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */ uint64_t ifs6_out_neighboradvert; /* ipv6IfIcmpOutRedirects, # of output redirects */ uint64_t ifs6_out_redirect; /* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */ uint64_t ifs6_out_mldquery; /* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */ uint64_t ifs6_out_mldreport; /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ uint64_t ifs6_out_mlddone; }; struct in6_ifreq { char ifr_name[IFNAMSIZ]; union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; struct icmp6_ifstat ifru_icmp6stat; u_int32_t ifru_scope_id[16]; } ifr_ifru; }; struct in6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; int ifra_vhid; }; /* pre-10.x compat */ struct oin6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; }; /* prefix type macro */ #define IN6_PREFIX_ND 1 #define IN6_PREFIX_RR 2 /* * prefix related flags passed between kernel(NDP related part) and * user land command(ifconfig) and daemon(rtadvd). */ struct in6_prflags { struct prf_ra { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } prf_ra; u_char prf_reserved1; u_short prf_reserved2; /* want to put this on 4byte offset */ struct prf_rr { u_char decrvalid : 1; u_char decrprefd : 1; u_char reserved : 6; } prf_rr; u_char prf_reserved3; u_short prf_reserved4; }; struct in6_prefixreq { char ipr_name[IFNAMSIZ]; u_char ipr_origin; u_char ipr_plen; u_int32_t ipr_vltime; u_int32_t ipr_pltime; struct in6_prflags ipr_flags; struct sockaddr_in6 ipr_prefix; }; #define PR_ORIG_RA 0 #define PR_ORIG_RR 1 #define PR_ORIG_STATIC 2 #define PR_ORIG_KERNEL 3 #define ipr_raf_onlink ipr_flags.prf_ra.onlink #define ipr_raf_auto ipr_flags.prf_ra.autonomous #define ipr_statef_onlink ipr_flags.prf_state.onlink #define ipr_rrf_decrvalid ipr_flags.prf_rr.decrvalid #define ipr_rrf_decrprefd ipr_flags.prf_rr.decrprefd struct in6_rrenumreq { char irr_name[IFNAMSIZ]; u_char irr_origin; u_char irr_m_len; /* match len for matchprefix */ u_char irr_m_minlen; /* minlen for matching prefix */ u_char irr_m_maxlen; /* maxlen for matching prefix */ u_char irr_u_uselen; /* uselen for adding prefix */ u_char irr_u_keeplen; /* keeplen from matching prefix */ struct irr_raflagmask { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } irr_raflagmask; u_int32_t irr_vltime; u_int32_t irr_pltime; struct in6_prflags irr_flags; struct sockaddr_in6 irr_matchprefix; struct sockaddr_in6 irr_useprefix; }; #define irr_raf_mask_onlink irr_raflagmask.onlink #define irr_raf_mask_auto irr_raflagmask.autonomous #define irr_raf_mask_reserved irr_raflagmask.reserved #define irr_raf_onlink irr_flags.prf_ra.onlink #define irr_raf_auto irr_flags.prf_ra.autonomous #define irr_statef_onlink irr_flags.prf_state.onlink #define irr_rrf irr_flags.prf_rr #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 */ #define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr)) #define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr)) #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) #define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) #ifdef _KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #define IN6_MASK_ADDR(a, m) do { \ (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ } while (0) #endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) #ifdef _KERNEL /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. */ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) #endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define OSIOCAIFADDR_IN6 _IOW('i', 26, struct oin6_aliasreq) #define SIOCAIFADDR_IN6 _IOW('i', 27, struct in6_aliasreq) #define SIOCSIFPHYADDR_IN6 _IOW('i', 70, struct in6_aliasreq) #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #ifdef _KERNEL #define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #endif #define SIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ndireq) #define SIOCSIFINFO_IN6 _IOWR('i', 109, struct in6_ndireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) #define SIOCGIFALIFETIME_IN6 _IOWR('i', 81, struct in6_ifreq) #define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq) #define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq) #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ #define SIOCSSCOPE6 _IOW('i', 88, struct in6_ifreq) #define SIOCGSCOPE6 _IOWR('i', 89, struct in6_ifreq) #define SIOCGSCOPE6DEF _IOWR('i', 90, struct in6_ifreq) #define SIOCSIFPREFIX_IN6 _IOW('i', 100, struct in6_prefixreq) /* set */ #define SIOCGIFPREFIX_IN6 _IOWR('i', 101, struct in6_prefixreq) /* get */ #define SIOCDIFPREFIX_IN6 _IOW('i', 102, struct in6_prefixreq) /* del */ #define SIOCAIFPREFIX_IN6 _IOW('i', 103, struct in6_rrenumreq) /* add */ #define SIOCCIFPREFIX_IN6 _IOW('i', 104, \ struct in6_rrenumreq) /* change */ #define SIOCSGIFPREFIX_IN6 _IOW('i', 105, \ struct in6_rrenumreq) /* set global */ #define SIOCGETSGCNT_IN6 _IOWR('u', 106, \ struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ #define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) #define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address * (obsolete) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */ #define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */ /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) #ifdef _KERNEL #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b)) #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif #ifdef _KERNEL VNET_DECLARE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DECLARE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DECLARE(u_long, in6_ifaddrhmask); #define V_in6_ifaddrhead VNET(in6_ifaddrhead) #define V_in6_ifaddrhashtbl VNET(in6_ifaddrhashtbl) #define V_in6_ifaddrhmask VNET(in6_ifaddrhmask) #define IN6ADDR_NHASH_LOG2 8 #define IN6ADDR_NHASH (1 << IN6ADDR_NHASH_LOG2) #define IN6ADDR_HASHVAL(x) (in6_addrhash(x)) #define IN6ADDR_HASH(x) \ (&V_in6_ifaddrhashtbl[IN6ADDR_HASHVAL(x) & V_in6_ifaddrhmask]) static __inline uint32_t in6_addrhash(const struct in6_addr *in6) { uint32_t x; x = in6->s6_addr32[0] ^ in6->s6_addr32[1] ^ in6->s6_addr32[2] ^ in6->s6_addr32[3]; return (fnv_32_buf(&x, sizeof(x), FNV1_32_INIT)); } extern struct rmlock in6_ifaddr_lock; #define IN6_IFADDR_LOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_LOCKED) #define IN6_IFADDR_RLOCK(t) rm_rlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_RLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_RLOCKED) #define IN6_IFADDR_RUNLOCK(t) rm_runlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_WLOCK() rm_wlock(&in6_ifaddr_lock) #define IN6_IFADDR_WLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_WLOCKED) #define IN6_IFADDR_WUNLOCK() rm_wunlock(&in6_ifaddr_lock) #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ counter_u64_add(((struct in6_ifextra *) \ ((ifp)->if_afdata[AF_INET6]))->in6_ifstat[ \ offsetof(struct in6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) extern u_char inet6ctlerrmap[]; VNET_DECLARE(unsigned long, in6_maxmtu); #define V_in6_maxmtu VNET(in6_maxmtu) #endif /* _KERNEL */ /* * IPv6 multicast MLD-layer source entry. */ struct ip6_msource { RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ struct in6_addr im6s_addr; struct im6s_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } im6s_st[2]; /* state at t0, t1 */ uint8_t im6s_stp; /* pending query */ }; RB_HEAD(ip6_msource_tree, ip6_msource); /* * IPv6 multicast PCB-layer source entry. * * NOTE: overlapping use of struct ip6_msource fields at start. */ struct in6_msource { RB_ENTRY(ip6_msource) im6s_link; /* Common field */ struct in6_addr im6s_addr; /* Common field */ uint8_t im6sl_st[2]; /* state before/at commit */ }; #ifdef _KERNEL /* * IPv6 source tree comparison function. * * An ordered predicate is necessary; bcmp() is not documented to return * an indication of order, memcmp() is, and is an ISO C99 requirement. */ static __inline int ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) { return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); } RB_PROTOTYPE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * IPv6 multicast PCB-layer group filter descriptor. */ struct in6_mfilter { struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ u_long im6f_nsrc; /* # of source entries */ uint8_t im6f_st[2]; /* state before/at commit */ struct in6_multi *im6f_in6m; /* associated multicast address */ STAILQ_ENTRY(in6_mfilter) im6f_entry; /* list entry */ }; /* * Helper types and functions for IPv4 multicast filters. */ STAILQ_HEAD(ip6_mfilter_head, in6_mfilter); struct in6_mfilter *ip6_mfilter_alloc(int mflags, int st0, int st1); void ip6_mfilter_free(struct in6_mfilter *); static inline void ip6_mfilter_init(struct ip6_mfilter_head *head) { STAILQ_INIT(head); } static inline struct in6_mfilter * ip6_mfilter_first(const struct ip6_mfilter_head *head) { return (STAILQ_FIRST(head)); } static inline void ip6_mfilter_insert(struct ip6_mfilter_head *head, struct in6_mfilter *imf) { STAILQ_INSERT_TAIL(head, imf, im6f_entry); } static inline void ip6_mfilter_remove(struct ip6_mfilter_head *head, struct in6_mfilter *imf) { STAILQ_REMOVE(head, imf, in6_mfilter, im6f_entry); } #define IP6_MFILTER_FOREACH(imf, head) \ STAILQ_FOREACH(imf, head, im6f_entry) static inline size_t ip6_mfilter_count(struct ip6_mfilter_head *head) { struct in6_mfilter *imf; size_t num = 0; STAILQ_FOREACH(imf, head, im6f_entry) num++; return (num); } /* * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; LIST_ENTRY(in6_multi_mship) i6mm_chain; }; /* * IPv6 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When MLDv2 is active, in6m_timer is the response to group query timer. * The state-change timer in6m_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: in6m_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in6_multi { struct in6_addr in6m_addr; /* IPv6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ u_int in6m_refcount; /* reference count */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ /* New fields for MLDv2 follow. */ struct mld_ifsoftc *in6m_mli; /* MLD info */ SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ SLIST_ENTRY(in6_multi) in6m_defer; /* deferred MLDv1 */ struct ip6_msource_tree in6m_srcs; /* tree of sources */ u_long in6m_nsrc; /* # of tree entries */ struct mbufq in6m_scq; /* queue of pending * state-change packets */ struct timeval in6m_lastgsrtv; /* last G-S-R query */ uint16_t in6m_sctimer; /* state-change timer */ uint16_t in6m_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing MLDv2 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct in6m_st { uint16_t iss_fmode; /* MLD filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } in6m_st[2]; /* state at t0, t1 */ }; void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm); /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, uint8_t t) { t = !!t; if (inm->in6m_st[t].iss_ex > 0 && inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) return (MCAST_EXCLUDE); else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } /* * Lock macros for IPv6 layer multicast address lists. IPv6 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ extern struct mtx in6_multi_list_mtx; extern struct sx in6_multi_sx; #define IN6_MULTI_LIST_LOCK() mtx_lock(&in6_multi_list_mtx) #define IN6_MULTI_LIST_UNLOCK() mtx_unlock(&in6_multi_list_mtx) #define IN6_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_OWNED) #define IN6_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_NOTOWNED) #define IN6_MULTI_LOCK() sx_xlock(&in6_multi_sx) #define IN6_MULTI_UNLOCK() sx_xunlock(&in6_multi_sx) #define IN6_MULTI_LOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XLOCKED) #define IN6_MULTI_UNLOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XUNLOCKED) /* * Get the in6_multi pointer from a ifmultiaddr. * Returns NULL if ifmultiaddr is no longer valid. */ static __inline struct in6_multi * in6m_ifmultiaddr_get_inm(struct ifmultiaddr *ifma) { NET_EPOCH_ASSERT(); return ((ifma->ifma_addr->sa_family != AF_INET6 || (ifma->ifma_flags & IFMA_F_ENQUEUED) == 0) ? NULL : ifma->ifma_protospec); } /* * Look up an in6_multi record for an IPv6 multicast address * on the interface ifp. * If no record found, return NULL. + * + * SMPng: The IN6_MULTI_LOCK and must be held and must be in network epoch. */ static __inline struct in6_multi * in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct ifmultiaddr *ifma; struct in6_multi *inm; CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) return (inm); } return (NULL); } /* * Wrapper for in6m_lookup_locked(). * - * SMPng: Assumes that neithr the IN6_MULTI_LOCK() or IF_ADDR_LOCK() are held. + * SMPng: Assumes network epoch entered and that IN6_MULTI_LOCK() isn't held. */ static __inline struct in6_multi * in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr) { - struct epoch_tracker et; struct in6_multi *inm; + NET_EPOCH_ASSERT(); + IN6_MULTI_LIST_LOCK(); - NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, mcaddr); - NET_EPOCH_EXIT(et); IN6_MULTI_LIST_UNLOCK(); return (inm); } /* Acquire an in6_multi record. */ static __inline void in6m_acquire_locked(struct in6_multi *inm) { IN6_MULTI_LIST_LOCK_ASSERT(); ++inm->in6m_refcount; } static __inline void in6m_acquire(struct in6_multi *inm) { IN6_MULTI_LIST_LOCK(); in6m_acquire_locked(inm); IN6_MULTI_LIST_UNLOCK(); } static __inline void in6m_rele_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); IN6_MULTI_LIST_LOCK_ASSERT(); if (--inm->in6m_refcount == 0) { MPASS(inm->in6m_ifp == NULL); inm->in6m_ifma->ifma_protospec = NULL; MPASS(inm->in6m_ifma->ifma_llifma == NULL); SLIST_INSERT_HEAD(inmh, inm, in6m_nrele); } } struct ip6_moptions; struct sockopt; struct inpcbinfo; /* Multicast KPIs. */ int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); int in6_joingroup(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_joingroup_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); int in6_leavegroup(struct in6_multi *, struct in6_mfilter *); int in6_leavegroup_locked(struct in6_multi *, struct in6_mfilter *); void in6m_clear_recorded(struct in6_multi *); void in6m_commit(struct in6_multi *); void in6m_print(const struct in6_multi *); int in6m_record_source(struct in6_multi *, const struct in6_addr *); void in6m_release_list_deferred(struct in6_multi_head *); void in6m_release_wait(void); void ip6_freemoptions(struct ip6_moptions *); int ip6_getmoptions(struct inpcb *, struct sockopt *); int ip6_setmoptions(struct inpcb *, struct sockopt *); /* flags to in6_update_ifa */ #define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ int in6_mask2len(struct in6_addr *, u_char *); int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); void in6_prepare_ifra(struct in6_aliasreq *, const struct in6_addr *, const struct in6_addr *); void in6_purgeaddr(struct ifaddr *); int in6if_do_dad(struct ifnet *); void in6_savemkludge(struct in6_ifaddr *); void *in6_domifattach(struct ifnet *); void in6_domifdetach(struct ifnet *, void *); int in6_domifmtu(struct ifnet *); void in6_setmaxmtu(void); int in6_if2idlen(struct ifnet *); struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, const struct in6_addr *); struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t); struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *); int in6_addr2zoneid(struct ifnet *, struct in6_addr *, u_int32_t *); int in6_matchlen(struct in6_addr *, struct in6_addr *); int in6_are_prefix_equal(struct in6_addr *, struct in6_addr *, int); void in6_prefixlen2mask(struct in6_addr *, int); int in6_prefix_ioctl(struct socket *, u_long, caddr_t, struct ifnet *); int in6_prefix_add_ifid(int, struct in6_ifaddr *); void in6_prefix_remove_ifid(int, struct in6_ifaddr *); void in6_purgeprefix(struct ifnet *); int in6_is_addr_deprecated(struct sockaddr_in6 *); int in6_src_ioctl(u_long, caddr_t); void in6_newaddrmsg(struct in6_ifaddr *, int); /* * Extended API for IPv6 FIB support. */ struct mbuf *ip6_tryforward(struct mbuf *); void in6_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in6_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); void in6_rtalloc(struct route_in6 *, u_int); void in6_rtalloc_ign(struct route_in6 *, u_long, u_int); struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_VAR_H_ */ diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 08dffde61721..4cf4288b1b5b 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,3238 +1,3241 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. */ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int fraglen , uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8")); m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += fraglen) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + fraglen >= tlen) fraglen = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(fraglen + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f); ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } static int ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_ext.ext_pgs->tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { error = EAGAIN; goto done; } } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; + struct epoch_tracker et; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ if (IPSEC_ENABLED(ipv6)) { if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } #endif /* IPSEC */ bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == NULL) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); + NET_EPOCH_ENTER(et); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* * Validate route against routing table additions; * a better/more specific route might have been added. * Make sure address family is set in route. */ if (inp) { ro->ro_dst.sin6_family = AF_INET6; RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* Setup data structures for scope ID checks. */ src0 = ip6->ip6_src; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; dst0 = ip6->ip6_dst; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; /* Check for valid scope ID. */ if (in6_setscope(&src0, ifp, &zone) == 0 && sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id && in6_setscope(&dst0, ifp, &zone) == 0 && sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) { /* * The outgoing interface is in the zone of the source * and destination addresses. * * Because the loopback interface cannot receive * packets with a different scope ID than its own, * there is a trick is to pretend the outgoing packet * was received by the real network interface, by * setting "origifp" different from "ifp". This is * only allowed when "ifp" is a loopback network * interface. Refer to code in nd6_output_ifp() for * more details. */ origifp = ifp; /* * We should use ia_ifp to support the case of sending * packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; } else if ((ifp->if_flags & IFF_LOOPBACK) == 0 || sa6_recoverscope(&src_sa) != 0 || sa6_recoverscope(&dst_sa) != 0 || dst_sa.sin6_scope_id == 0 || (src_sa.sin6_scope_id != 0 && src_sa.sin6_scope_id != dst_sa.sin6_scope_id) || (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) { /* * If the destination network interface is not a * loopback interface, or the destination network * address has no scope ID, or the source address has * a scope ID set which is different from the * destination address one, or there is no network * interface representing this scope ID, the address * pair is considered invalid. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(ifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; } /* All scope ID checks are successful. */ if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) { case PFIL_PASS: ip6 = mtod(m, struct ip6_hdr *); break; case PFIL_DROPPED: error = EPERM; /* FALLTHROUGH */ case PFIL_CONSUMED: goto done; } needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { RO_INVALIDATE_CACHE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_INVALIDATE_CACHE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; m = mb_unmapped_to_ext(m); if (m == NULL) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto bad; } in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { m = mb_unmapped_to_ext(m); if (m == NULL) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto bad; } } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; m = mb_unmapped_to_ext(m); if (m == NULL) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto bad; } sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } error = ip6_output_send(inp, ifp, origifp, m, dst, ro); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m = mb_unmapped_to_ext(m); if (m == NULL) { in6_ifstat_inc(ifp, ifs6_out_fragfail); error = ENOBUFS; goto bad; } in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { m = mb_unmapped_to_ext(m); if (m == NULL) { in6_ifstat_inc(ifp, ifs6_out_fragfail); error = ENOBUFS; goto bad; } sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = ip6_output_send(inp, ifp, origifp, m, dst, ro); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: + NET_EPOCH_EXIT(et); if (ro == &ip6route) RO_RTFREE(ro); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct nhop6_extended nh6; struct in6_addr kdst; uint32_t scopeid; struct ifnet *ifp; u_long mtu; int error; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0) return (EHOSTUNREACH); ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0); fib6_free_nh_ext(fibnum, &nh6); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. */ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst; u_long mtu; mtu = 0; if (do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; if (ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0, &nh6) == 0) ro_pmtu->ro_mtu = nh6.nh_mtu; } mtu = ro_pmtu->ro_mtu; } if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); /* TCP is known to react to pmtu changes so skip hc */ if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *inp = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif /* * Don't use more than a quarter of mbuf clusters. N.B.: * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow * on LP64 architectures, so cast to u_long to avoid undefined * behavior. ILP32 architectures cannot have nmbclusters * large enough to overflow for other reasons. */ #define IPV6_PKTOPTIONS_MBUF_LIMIT ((u_long)nmbclusters * MCLBYTES / 4) level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) { printf("ip6_ctloutput: mbuf limit hit\n"); error = ENOBUFS; break; } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&inp->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_ORIGDSTADDR: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ inp->in6p_hops = optval; if ((inp->inp_vflag & INP_IPV4) != 0) inp->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(inp); \ inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ OPTSET2_N(bit, val); \ INP_WUNLOCK(inp); \ } while (0) #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ INP_WLOCK(inp); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: OPTSET2292_EXCLUSIVE(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_RECVHOPLIMIT: OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: OPTSET2292_EXCLUSIVE(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) inp->inp_vflag &= ~INP_IPV4; else inp->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ OPTSET2292_EXCLUSIVE(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { INP_WLOCK(inp); inp->inp_rss_listen_bucket = optval; OPTSET2_N(INP_RSS_BUCKET_SET, 1); INP_WUNLOCK(inp); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(inp, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IPV6_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = inp->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = inp->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = inp->inp_flowid; break; case IPV6_FLOWTYPE: optval = inp->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct in6_addr addr; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. * Copy faddr out of inp to avoid holding lock * on inp during route lookup. */ INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &addr, sizeof(addr)); INP_RUNLOCK(inp); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(inp, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *inp = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if (optval < -1 || (optval % 2) != 0) { /* * The API assumes non-negative even offset * values or -1 as a special value. */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else inp->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = inp->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_NOWAIT); if (*pktopt == NULL) return (ENOBUFS); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ INP_RUNLOCK(inp); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ INP_RLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_RUNLOCK(inp); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ pktopt = inp->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(&pktopt->field, optdata, optdatalen); \ } else { \ free(optdata, M_TEMP); \ optdata = NULL; \ malloc_optdata = false; \ } \ } \ } while(0) #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field, \ (((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3) #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field, \ pktopt->field->sa_len) static int ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; int optdatalen = 0; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; INP_RLOCK(inp); pktopt = inp->in6p_outputopts; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) deftclass = pktopt->ip6po_tclass; optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: GET_PKTOPT_EXT_HDR(ip6po_hbh); break; case IPV6_RTHDR: GET_PKTOPT_EXT_HDR(ip6po_rthdr); break; case IPV6_RTHDRDSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest1); break; case IPV6_DSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest2); break; case IPV6_NEXTHOP: GET_PKTOPT_SOCKADDR(ip6po_nexthop); break; case IPV6_USE_MIN_MTU: if (pktopt) defminmtu = pktopt->ip6po_minmtu; optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) defpreftemp = pktopt->ip6po_prefer_tempaddr; optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif INP_RUNLOCK(inp); return (ENOPROTOOPT); } INP_RUNLOCK(inp); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) free(optdata, M_TEMP); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *inp) { int len; if (!inp->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(inp->in6p_outputopts->ip6po_hbh); if (inp->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(inp->in6p_outputopts->ip6po_dest1); len += elen(inp->in6p_outputopts->ip6po_rthdr); len += elen(inp->in6p_outputopts->ip6po_dest2); return len; #undef elen } diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 6ea63b47b47a..868edeaff8c5 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -1,3338 +1,3329 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifsoftc * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(struct in6_multi_head *inmh); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct in6_multi_head *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, struct mldv2_query *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void mld_v2_process_group_timers(struct in6_multi_head *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, struct mldv2_query *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ VNET_DEFINE_STATIC(struct timeval, mld_gsrdelay) = {10, 0}; VNET_DEFINE_STATIC(LIST_HEAD(, mld_ifsoftc), mli_head); VNET_DEFINE_STATIC(int, interface_timers_running6); VNET_DEFINE_STATIC(int, state_change_timers_running6); VNET_DEFINE_STATIC(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_v2enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN, &mld_v2enable, 0, "Enable MLDv2"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored: cuvnet %p m ptr %p", __func__, curvnet, m->m_pkthdr.PH_loc.ptr)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifsoftc * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifsoftc *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp, struct in6_multi_head *inmh) { struct epoch_tracker et; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); IF_ADDR_WLOCK(ifp); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; in6m_disconnect_locked(inmh, inm); if (mli->mli_version == MLD_VERSION_2) { in6m_clear_recorded(inm); /* * We need to release the final reference held * for issuing the INCLUDE {}. */ if (inm->in6m_state == MLD_LEAVING_MEMBER) { inm->in6m_state = MLD_NOT_MEMBER; in6m_rele_locked(inmh, inm); } } } NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); MLD_UNLOCK(); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); free(mli, M_MLD); return; } } } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { - struct epoch_tracker et; struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif + NET_EPOCH_ASSERT(); + is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. */ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; - NET_EPOCH_ENTER(et); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } - NET_EPOCH_EXIT(et); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LIST_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that mld points to a struct mldv2_query which is stored in * contiguous memory. * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, struct mldv2_query *mld, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif + NET_EPOCH_ASSERT(); + if (!mld_v2enable) { CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } is_general_query = 0; CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { - struct epoch_tracker et; - /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ - NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, &mld->mld_addr); - if (inm == NULL) { - NET_EPOCH_EXIT(et); + if (inm == NULL) goto out_locked; - } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); - NET_EPOCH_EXIT(et); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, mld, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); - NET_EPOCH_EXIT(et); } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, struct mldv2_query *mld, const int off) { int retval; uint16_t nsrc; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* Length should be checked by calling function. */ KASSERT((m0->m_flags & M_PKTHDR) == 0 || m0->m_pkthdr.len >= off + sizeof(struct mldv2_query) + nsrc * sizeof(struct in6_addr), ("mldv2 packet is too short: (%d bytes < %zd bytes, m=%p)", m0->m_pkthdr.len, off + sizeof(struct mldv2_query) + nsrc * sizeof(struct in6_addr), m0)); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct in6_addr srcaddr; int i, nrecorded; int soff; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { m_copydata(m0, soff, sizeof(struct in6_addr), (caddr_t)&srcaddr); retval = in6m_record_source(inm, &srcaddr); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; - struct epoch_tracker et; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif + NET_EPOCH_ASSERT(); + if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); - NET_EPOCH_ENTER(et); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: - NET_EPOCH_EXIT(et); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. * Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. */ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, (struct mldv2_query *)mld, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { struct in6_multi_head inmh; VNET_ITERATOR_DECL(vnet_iter); SLIST_INIT(&inmh); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(&inmh); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); in6m_release_list_deferred(&inmh); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(struct in6_multi_head *inmh) { struct epoch_tracker et; struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. */ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } - IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); + IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(inmh, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(inmh, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_WUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ while ((inm = SLIST_FIRST(inmh)) != NULL) { SLIST_REMOVE_HEAD(inmh, in6m_defer); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); break; } NET_EPOCH_EXIT(et); } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; SLIST_INSERT_HEAD(inmh, inm, in6m_defer); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct in6_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; in6m_disconnect_locked(inmh, inm); in6m_rele_locked(inmh, inm); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { struct epoch_tracker et; struct in6_multi_head inmh; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. */ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } } NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); in6m_release_list_deferred(&inmh); } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; /* in process of being freed */ if (ifp == NULL) return (0); ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLd module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; IN6_MULTI_LIST_LOCK_ASSERT(); error = 0; /* * Check if the in6_multi has already been disconnected. */ if (inm->in6m_ifp == NULL) { CTR1(KTR_MLD, "%s: inm is disconnected", __func__); return (0); } /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp == NULL) return (0); /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ). */ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) { inm->in6m_refcount--; MPASS(inm->in6m_refcount > 0); } inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. */ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { - struct epoch_tracker et; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; + NET_EPOCH_ASSERT(); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&mli->mli_gq) != 0) goto send; ifp = mli->mli_ifp; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } - NET_EPOCH_EXIT(et); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { /* This can happen if we shutdown the network stack. */ CTR1(KTR_MLD, "%s: tearing down", __func__); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index e359b72cb455..fae88877a2f5 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -1,2757 +1,2750 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ VNET_DEFINE_STATIC(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ VNET_DEFINE_STATIC(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh, iflladdr_event_eh, ifnet_link_event_eh; VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); VNET_DEFINE(uint64_t, nd6_list_genid); VNET_DEFINE(struct mtx, nd6_onlink_mtx); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry **, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); VNET_DEFINE_STATIC(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; int fibnum; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, fibnum); } /* * A handler for interface link layer address change event. */ static void nd6_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE6(ifp)); } void nd6_init(void) { mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF); rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); TAILQ_INIT(&V_nd_defrouter); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); ifnet_link_event_eh = EVENTHANDLER_REGISTER(ifnet_link_event, nd6_ifnet_link_event, NULL, EVENTHANDLER_PRI_ANY); } } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(ifnet_link_event, ifnet_link_event_eh); EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } rw_destroy(&V_nd6_lock); mtx_destroy(&V_nd6_onlink_mtx); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { struct epoch_tracker et; struct ifaddr *ifa, *next; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; /* stop DAD processing */ nd6_dad_stop(ifa); } NET_EPOCH_EXIT(et); free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; ndi->maxmtu = ifp->if_mtu; /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). * * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so * we perform the following upon entering STALE state: * * 1) Arm timer to run each V_nd6_delay seconds to make sure that * if packet was transmitted at the start of given interval, we * would be able to switch to PROBE state in V_nd6_delay seconds * as user expects. * * 2) Reschedule timer until original V_nd6_gctimer expires keeping * lle in STALE state (remaining timer value stored in lle_remtime). * * 3) Reschedule timer if packet was transmitted less that V_nd6_delay * seconds ago. * * Returns non-zero value if the entry is still STALE (storing * the next timer interval in @pdelay). * * Returns zero value if original timer expired or we need to switch to * PROBE (store that in @do_switch variable). */ static int nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch) { int nd_delay, nd_gctimer, r_skip_req; time_t lle_hittime; long delay; *do_switch = 0; nd_gctimer = V_nd6_gctimer; nd_delay = V_nd6_delay; LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); if (r_skip_req > 0) { /* * Nonzero r_skip_req value was set upon entering * STALE state. Since value was not changed, no * packets were passed using this lle. Ask for * timer reschedule and keep STALE state. */ delay = (long)(MIN(nd_gctimer, nd_delay)); delay *= hz; if (lle->lle_remtime > delay) lle->lle_remtime -= delay; else { delay = lle->lle_remtime; lle->lle_remtime = 0; } if (delay == 0) { /* * The original ng6_gctime timeout ended, * no more rescheduling. */ return (0); } *pdelay = delay; return (1); } /* * Packet received. Verify timestamp */ delay = (long)(time_uptime - lle_hittime); if (delay < nd_delay) { /* * V_nd6_delay still not passed since the first * hit in STALE state. * Reshedule timer and return. */ *pdelay = (long)(nd_delay - delay) * hz; return (1); } /* Request switching to probe */ *do_switch = 1; return (0); } /* * Switch @lle state to new state optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; int nd_gctimer, nd_delay; long delay, remtime; delay = 0; remtime = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: /* * Notify fast path that we want to know if any packet * is transmitted by setting r_skip_req. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); nd_delay = V_nd6_delay; nd_gctimer = V_nd6_gctimer; delay = (long)(MIN(nd_gctimer, nd_delay)) * hz; remtime = (long)nd_gctimer * hz - delay; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->lle_remtime = remtime; lle->ln_state = newstate; } /* * Timer-dependent part of nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { + struct epoch_tracker et; struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi; int do_switch, send_ns; long delay; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; ifp = lltable_get_ifp(ln->lle_tbl); CURVNET_SET(ifp->if_vnet); ND6_RLOCK(); LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by nd6_llinfo_settimer_locked above since canceled * would have been 1. */ LLE_WUNLOCK(ln); ND6_RUNLOCK(); CURVNET_RESTORE(); return; } + NET_EPOCH_ENTER(et); ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(&ln, 0); goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to multicast address */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } nd6_free(&ln, 0); if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: if (nd6_is_stale(ln, &delay, &do_switch) != 0) { /* * No packet has used this entry and GC timeout * has not been passed. Reshedule timer and * return. */ nd6_llinfo_settimer_locked(ln, delay); break; } if (do_switch == 0) { /* * GC timer has ended and entry hasn't been used. * Run Garbage collector (RFC 4861, 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) nd6_free(&ln, 1); break; } /* Entry has been used AND delay timer has ended. */ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { nd6_free(&ln, 0); } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) ND6_RUNLOCK(); if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); + struct epoch_tracker et; struct nd_drhead drq; struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct ifnet *ifp; struct in6_ifaddr *ia6, *nia6; uint64_t genid; TAILQ_INIT(&drq); LIST_INIT(&prl); ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) if (dr->expire && dr->expire < time_uptime) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ + NET_EPOCH_ENTER(et); addrloop: CK_STAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ ifp = ia6->ia_ifp; if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0 && ((ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)){ ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } + NET_EPOCH_EXIT(et); ND6_WLOCK(); restart: LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * Expire prefixes. Since the pltime is only used for * autoconfigured addresses, pltime processing for prefixes is * not necessary. * * Only unlink after all derived addresses have expired. This * may not occur until two hours after the prefix has expired * per RFC 4862. If the prefix expires before its derived * addresses, mark it off-link. This will be done automatically * after unlinking if no address references remain. */ if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME || time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime) continue; if (pr->ndpr_addrcnt == 0) { nd6_prefix_unlink(pr, &prl); continue; } if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { genid = V_nd6_list_genid; nd6_prefix_ref(pr); ND6_WUNLOCK(); ND6_ONLINK_LOCK(); (void)nd6_prefix_offlink(pr); ND6_ONLINK_UNLOCK(); ND6_WLOCK(); nd6_prefix_rele(pr); if (genid != V_nd6_list_genid) goto restart; } } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { - struct epoch_tracker et; struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; + NET_EPOCH_ASSERT(); + ifp = ia6->ia_ifa.ifa_ifp; - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); - NET_EPOCH_EXIT(et); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Remove prefix and default router list entries corresponding to ifp. Neighbor * cache entries are freed in in6_domifdetach(). */ void nd6_purge(struct ifnet *ifp) { struct nd_drhead drq; struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; TAILQ_INIT(&drq); LIST_INIT(&prl); /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. */ ND6_WLOCK(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrouter_unlink(dr, &drq); } /* * Remove prefixes on ifp. We should have already removed addresses on * this interface, so no addresses should be referencing these prefixes. */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); /* Delete the unlinked router and prefix objects. */ while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select_fib(ifp->if_fib); } } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *ifa; struct rt_addrinfo info; struct sockaddr_in6 rt_key; const struct sockaddr *dst6; uint64_t genid; int error, fibnum; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } bzero(&rt_key, sizeof(rt_key)); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ ND6_RLOCK(); restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { dst6 = (const struct sockaddr *)&pr->ndpr_prefix; /* * We only need to check all FIBs if add_addr_allfibs * is unset. If set, checking any FIB will suffice. */ fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0; for (; fibnum < rt_numfibs; fibnum++) { genid = V_nd6_list_genid; ND6_RUNLOCK(); /* * Restore length field before * retrying lookup */ rt_key.sin6_len = sizeof(rt_key); error = rib_lookup_info(fibnum, dst6, 0, 0, &info); ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; if (error == 0) break; } if (error != 0) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &rt_key.sin6_addr)) continue; } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) { ND6_RUNLOCK(); return (1); } } ND6_RUNLOCK(); /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ if (ifp->if_flags & IFF_POINTOPOINT) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sin6_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { NET_EPOCH_EXIT(et); return 1; } } NET_EPOCH_EXIT(et); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { - struct epoch_tracker et; struct llentry *lle; int rc = 0; + NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ - NET_EPOCH_ENTER(et); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } - NET_EPOCH_EXIT(et); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry **lnp, int gc) { struct ifnet *ifp; struct llentry *ln; struct nd_defrouter *dr; ln = *lnp; *lnp = NULL; LLE_WLOCK_ASSERT(ln); ND6_RLOCK_ASSERT(); ifp = lltable_get_ifp(ln->lle_tbl); if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp); else dr = NULL; ND6_RUNLOCK(); if ((ln->la_flags & LLE_DELETED) == 0) EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); defrouter_rele(dr); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select_fib() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select_fib() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select_fib(dr->ifp->if_fib); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); if (dr != NULL) defrouter_rele(dr); } static int nd6_isdynrte(const struct rtentry *rt, void *xap) { if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); } } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; struct epoch_tracker et; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intended for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; + NET_EPOCH_ENTER(et); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } - NET_EPOCH_EXIT(et); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } - NET_EPOCH_EXIT(et); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } - NET_EPOCH_EXIT(et); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } - } ND_IFINFO(ifp)->flags = ND.flags; + NET_EPOCH_EXIT(et); break; + } #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct in6_ifaddr *ia, *ia_next; struct nd_prefix *pr, *next; struct nd_prhead prl; LIST_INIT(&prl); ND6_WLOCK(); LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); /* XXXRW: in6_ifaddrhead locking. */ CK_STAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } nd6_prefix_del(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_drhead drq; struct nd_defrouter *dr; TAILQ_INIT(&drq); defrouter_reset(); ND6_WLOCK(); while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL) defrouter_unlink(dr, &drq); ND6_WUNLOCK(); while ((dr = TAILQ_FIRST(&drq)) != NULL) { TAILQ_REMOVE(&drq, dr, dr_entry); defrouter_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); NET_EPOCH_ENTER(et); ln = nd6_lookup(&nb_addr, 0, ifp); NET_EPOCH_EXIT(et); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; - struct epoch_tracker et; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; + NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; - NET_EPOCH_ENTER(et); ln = nd6_lookup(from, flags, ifp); - NET_EPOCH_EXIT(et); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select_fib() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select_fib() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select_fib(ifp->if_fib); } } static void nd6_slowtimo(void *arg) { struct epoch_tracker et; CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); } /* * Lookup link headerfor @sa_dst address. Stores found * data in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification * is required, call "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { - struct epoch_tracker et; struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; + NET_EPOCH_ASSERT(); + if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } - NET_EPOCH_ENTER(et); ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have feedback request from nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } if (plle) { LLE_ADDREF(ln); *plle = ln; LLE_WUNLOCK(ln); } - NET_EPOCH_EXIT(et); return (0); } else if (plle && ln) LLE_WUNLOCK(ln); - NET_EPOCH_EXIT(et); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Heavy version. * Function assume that destination LLE does not exist, * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns, ll_len; char *lladdr; + NET_EPOCH_ASSERT(); + /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); - NET_EPOCH_EXIT(et); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { m_freem(m); return (ENOBUFS); } if (m != NULL) m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { if (flags & LLE_ADDRONLY) { lladdr = lle->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = lle->r_linkdata; ll_len = lle->r_hdrlen; } bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; if (plle) { LLE_ADDREF(lle); *plle = lle; } LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for newly-created lle la_asked will be 0, * so we will transition from ND6_LLINFO_NOSTATE to * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = nd6_resolve_slow(ifp, flags, NULL, (const struct sockaddr_in6 *)dst, desten, pflags, NULL); return (error); } int nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; int error = 0; m_head = chain; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, ifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than Ethernet and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any entry if exists */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_icmp6); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter", "NDP default router list"); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", "NDP prefix list"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr != NULL) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) break; d.flags = dr->raflags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) break; } ND6_RUNLOCK(); return (error); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); ND6_RLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) break; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) goto out; } } out: ND6_RUNLOCK(); return (error); } diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index cbac0dcb724e..3624909cc164 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -1,1549 +1,1553 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *); static void nd6_dad_add(struct dadq *dp); static void nd6_dad_del(struct dadq *dp); static void nd6_dad_rele(struct dadq *); static void nd6_dad_starttimer(struct dadq *, int, int); static void nd6_dad_stoptimer(struct dadq *); static void nd6_dad_timer(struct dadq *); static void nd6_dad_duplicated(struct ifaddr *, struct dadq *); static void nd6_dad_ns_output(struct dadq *); static void nd6_dad_ns_input(struct ifaddr *, struct nd_opt_nonce *); static void nd6_dad_na_input(struct ifaddr *); static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_long, int, struct sockaddr *, u_int); static void nd6_ns_output_fib(struct ifnet *, const struct in6_addr *, const struct in6_addr *, const struct in6_addr *, uint8_t *, u_int); VNET_DEFINE_STATIC(int, dad_enhanced) = 1; #define V_dad_enhanced VNET(dad_enhanced) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dad_enhanced), 0, "Enable Enhanced DAD, which adds a random nonce to NS messages for DAD."); VNET_DEFINE_STATIC(int, dad_maxtry) = 15; /* max # of *tries* to transmit DAD packet */ #define V_dad_maxtry VNET(dad_maxtry) /* * Input a Neighbor Solicitation Message. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; struct in6_addr saddr6 = ip6->ip6_src; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; struct in6_addr myaddr6; char *lladdr = NULL; struct ifaddr *ifa = NULL; int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; int rflag; union nd_opts ndopts; struct sockaddr_dl proxydl; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0; if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif) rflag = 0; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } } else if (!V_nd6_onlink_ns_rfc4861) { struct sockaddr_in6 src_sa6; /* * According to recent IETF discussions, it is not a good idea * to accept a NS from an address which would not be deemed * to be a neighbor otherwise. This point is expected to be * clarified in future revisions of the specification. */ bzero(&src_sa6, sizeof(src_sa6)); src_sa6.sin6_family = AF_INET6; src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = saddr6; if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) { nd6log((LOG_INFO, "nd6_ns_input: " "NS packet from non-neighbor\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. */ if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ if (ifp->if_carp) ifa = (*carp_iamatch6_p)(ifp, &taddr6); else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ if (ifa == NULL) { struct sockaddr_dl rt_gateway; struct rt_addrinfo info; struct sockaddr_in6 dst6; bzero(&dst6, sizeof(dst6)); dst6.sin6_len = sizeof(struct sockaddr_in6); dst6.sin6_family = AF_INET6; dst6.sin6_addr = taddr6; bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sdl_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, (struct sockaddr *)&dst6, 0, 0, &info) == 0) { if ((info.rti_flags & RTF_ANNOUNCE) != 0 && rt_gateway.sdl_family == AF_LINK) { /* * proxy NDP for single entry */ proxydl = *SDL(&rt_gateway); ifa = (struct ifaddr *)in6ifa_ifpforlinklocal( ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ifa) proxy = 1; } } } if (ifa == NULL) { /* * We've got an NS packet, and we don't have that adddress * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &saddr6))); goto freeit; } /* * We have neighbor solicitation packet, with target address equals to * one of my tentative address. * * src addr how to process? * --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462. */ if (tentative) { /* * If source address is unspecified address, it is for * duplicate address detection. * * If not, the packet is for addess resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa, ndopts.nd_opts_nonce); goto freeit; } /* * If the source address is unspecified address, entries must not * be created or updated. * It looks that sender is performing DAD. Output NA toward * all-node multicast address, to tell the sender that I'm using * the address. * S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct in6_addr in6_all; in6_all = in6addr_linklocal_allnodes; if (in6_setscope(&in6_all, ifp, NULL) != 0) goto bad; nd6_na_output_fib(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m)); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output_fib(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | rflag | ND_NA_FLAG_SOLICITED, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m)); freeit: if (ifa != NULL) ifa_free(ifa); m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(ip6bufs, &saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); ICMP6STAT_INC(icp6s_badns); if (ifa != NULL) ifa_free(ifa); m_freem(m); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * ln - for source address determination * nonce - If non-NULL, NS is used for duplicate address detection and * the value (length is ND_OPT_NONCE_LEN) is used as a random nonce. */ static void nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6, const struct in6_addr *daddr6, const struct in6_addr *taddr6, uint8_t *nonce, u_int fibnum) { struct mbuf *m; struct m_tag *mtag; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct ip6_moptions im6o; int icmp6len; int maxlen; caddr_t mac; if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERT(max_linkhdr + maxlen <= MCLBYTES, ( "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", __func__, max_linkhdr, maxlen, MCLBYTES)); if (max_linkhdr + maxlen > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; M_SETFIB(m, fibnum); if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } if (nonce == NULL) { struct ifaddr *ifa = NULL; /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. * Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if saddr6 belongs to the outgoing interface. * Otherwise, we perform the source address selection as usual. */ if (saddr6 != NULL) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6); if (ifa != NULL) { /* ip6_src set already. */ ip6->ip6_src = *saddr6; ifa_free(ifa); } else { int error; struct in6_addr dst6, src6; uint32_t scopeid; in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid); error = in6_selectsrc_addr(fibnum, &dst6, scopeid, ifp, &src6, NULL); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "%s: source can't be " "determined: dst=%s, error=%d\n", __func__, ip6_sprintf(ip6buf, &dst6), error)); goto bad; } ip6->ip6_src = src6; } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) * We actually don't have to 0-clear the address (we did it * above), but we do so here explicitly to make the intention * clearer. */ bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); } nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (nonce == NULL && (mac = nd6_ifptomac(ifp))) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } /* * Add a Nonce option (RFC 3971) to detect looped back NS messages. * This behavior is documented as Enhanced Duplicate Address * Detection in RFC 7527. * net.inet6.ip6.dad_enhanced=0 disables this. */ if (V_dad_enhanced != 0 && nonce != NULL) { int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8-byte alignment is required. */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_NONCE; nd_opt->nd_opt_len = optlen >> 3; bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; *(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type; m_tag_prepend(m, mtag); } ip6_output(m, NULL, NULL, (nonce != NULL) ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]); return; bad: m_freem(m); } #ifndef BURN_BRIDGES void nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6, const struct in6_addr *daddr6, const struct in6_addr *taddr6,uint8_t *nonce) { nd6_ns_output_fib(ifp, saddr6, daddr6, taddr6, nonce, RT_DEFAULT_FIB); } #endif /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { - struct epoch_tracker et; struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; int is_router; int is_solicited; int is_override; char *lladdr = NULL; int lladdrlen = 0; int checklink = 0; struct ifaddr *ifa; struct llentry *ln = NULL; union nd_opts ndopts; struct mbuf *chain = NULL; struct sockaddr_in6 sin6; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; + NET_EPOCH_ASSERT(); + /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); memset(&sin6, 0, sizeof(sin6)); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(ip6bufs, &taddr6))); goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } /* * This effectively disables the DAD check on a non-master CARP * address. */ if (ifp->if_carp) ifa = (*carp_iamatch6_p)(ifp, &taddr6); else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); ifa_free(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { ifa_free(ifa); log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be * discarded. */ - NET_EPOCH_ENTER(et); ln = nd6_lookup(&taddr6, LLE_EXCLUSIVE, ifp); - NET_EPOCH_EXIT(et); if (ln == NULL) { goto freeit; } if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && lladdr == NULL) { goto freeit; } /* * Record link-layer address, and update the state. */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { ln = NULL; goto freeit; } EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (is_solicited) nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE); else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes.. */ checklink = 1; } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (lladdr == NULL) llchange = 0; else { if (ln->la_flags & LLE_VALID) { if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; } /* * This is VERY complex. Look at it with care. * * override solicit lladdr llchange action * (L: record lladdr) * * 0 0 n -- (2c) * 0 0 y n (2b) L * 0 0 y y (1) REACHABLE->STALE * 0 1 n -- (2c) *->REACHABLE * 0 1 y n (2b) L *->REACHABLE * 0 1 y y (1) REACHABLE->STALE * 1 0 n -- (2a) * 1 0 y n (2a) L * 1 0 y y (2a) L *->STALE * 1 1 n -- (2a) *->REACHABLE * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr != NULL && !llchange)) /* (2b) */ || lladdr == NULL) { /* (2c) */ /* * Update link-local address, if any. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) goto freeit; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { ln = NULL; goto freeit; } EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE); else { if (lladdr != NULL && llchange) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct ifnet *nd6_ifp; nd6_ifp = lltable_get_ifp(ln->lle_tbl); if (!defrouter_remove(&ln->r_l3addr.addr6, nd6_ifp) && (ND_IFINFO(nd6_ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) /* * Even if the neighbor is not in the default * router list, the neighbor may be used as a * next hop for some destinations (e.g. redirect * case). So we must call rt6_flush explicitly. */ rt6_flush(&ip6->ip6_src, ifp); } ln->ln_router = is_router; } /* XXX - QL * Does this matter? * rt->rt_flags &= ~RTF_REJECT; */ ln->la_asked = 0; if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); freeit: if (ln != NULL) LLE_WUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, chain, &sin6); if (checklink) pfxlist_onlink_check(); m_freem(m); return; bad: if (ln != NULL) LLE_WUNLOCK(ln); ICMP6STAT_INC(icp6s_badna); m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) * * tlladdr - 1 if include target link-layer address * sdl0 - sockaddr_dl (= proxy NA) or NULL */ static void nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0, u_int fibnum) { struct mbuf *m; struct m_tag *mtag; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct ip6_moptions im6o; struct in6_addr daddr6, dst6, src6; uint32_t scopeid; int icmp6len, maxlen, error; caddr_t mac = NULL; daddr6 = *daddr6_0; /* make a local copy for modification */ /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERT(max_linkhdr + maxlen <= MCLBYTES, ( "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", __func__, max_linkhdr, maxlen, MCLBYTES)); if (max_linkhdr + maxlen > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; M_SETFIB(m, fibnum); if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; daddr6.s6_addr16[1] = 0; daddr6.s6_addr32[1] = 0; daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if (in6_setscope(&daddr6, ifp, NULL)) goto bad; flags &= ~ND_NA_FLAG_SOLICITED; } ip6->ip6_dst = daddr6; /* * Select a source whose scope is the same as that of the dest. */ in6_splitscope(&daddr6, &dst6, &scopeid); error = in6_selectsrc_addr(fibnum, &dst6, scopeid, ifp, &src6, NULL); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &daddr6), error)); goto bad; } ip6->ip6_src = src6; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ if (sdl0 == NULL) { if (ifp->if_carp) mac = (*carp_macmatch6_p)(ifp, m, taddr6); if (mac == NULL) mac = nd6_ifptomac(ifp); } else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; *(unsigned short *)(mtag + 1) = nd_na->nd_na_type; m_tag_prepend(m, mtag); } ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]); return; bad: m_freem(m); } #ifndef BURN_BRIDGES void nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0) { nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0, RT_DEFAULT_FIB); } #endif caddr_t nd6_ifptomac(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: return IF_LLADDR(ifp); default: return NULL; } } struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; int dad_ns_lcount; /* looped back NS */ int dad_loopbackprobe; /* probing state for loopback detection */ struct callout dad_timer_ch; struct vnet *dad_vnet; u_int dad_refcnt; #define ND_OPT_NONCE_LEN32 \ ((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t)) uint32_t dad_nonce[ND_OPT_NONCE_LEN32]; bool dad_ondadq; /* on dadq? Protected by DADQ_WLOCK. */ }; VNET_DEFINE_STATIC(TAILQ_HEAD(, dadq), dadq); VNET_DEFINE_STATIC(struct rwlock, dad_rwlock); #define V_dadq VNET(dadq) #define V_dad_rwlock VNET(dad_rwlock) #define DADQ_RLOCK() rw_rlock(&V_dad_rwlock) #define DADQ_RUNLOCK() rw_runlock(&V_dad_rwlock) #define DADQ_WLOCK() rw_wlock(&V_dad_rwlock) #define DADQ_WUNLOCK() rw_wunlock(&V_dad_rwlock) static void nd6_dad_add(struct dadq *dp) { DADQ_WLOCK(); TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list); dp->dad_ondadq = true; DADQ_WUNLOCK(); } static void nd6_dad_del(struct dadq *dp) { DADQ_WLOCK(); if (dp->dad_ondadq) { /* * Remove dp from the dadq and release the dadq's * reference. */ TAILQ_REMOVE(&V_dadq, dp, dad_list); dp->dad_ondadq = false; DADQ_WUNLOCK(); nd6_dad_rele(dp); } else DADQ_WUNLOCK(); } static struct dadq * nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *n) { struct dadq *dp; DADQ_RLOCK(); TAILQ_FOREACH(dp, &V_dadq, dad_list) { if (dp->dad_ifa != ifa) continue; /* * Skip if the nonce matches the received one. * +2 in the length is required because of type and * length fields are included in a header. */ if (n != NULL && n->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 && memcmp(&n->nd_opt_nonce[0], &dp->dad_nonce[0], ND_OPT_NONCE_LEN) == 0) { dp->dad_ns_lcount++; continue; } refcount_acquire(&dp->dad_refcnt); break; } DADQ_RUNLOCK(); return (dp); } static void nd6_dad_starttimer(struct dadq *dp, int ticks, int send_ns) { - if (send_ns != 0) + if (send_ns != 0) { + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); nd6_dad_ns_output(dp); + NET_EPOCH_EXIT(et); + } callout_reset(&dp->dad_timer_ch, ticks, (void (*)(void *))nd6_dad_timer, (void *)dp); } static void nd6_dad_stoptimer(struct dadq *dp) { callout_drain(&dp->dad_timer_ch); } static void nd6_dad_rele(struct dadq *dp) { if (refcount_release(&dp->dad_refcnt)) { ifa_free(dp->dad_ifa); free(dp, M_IP6NDP); } } void nd6_dad_init(void) { rw_init(&V_dad_rwlock, "nd6 DAD queue"); TAILQ_INIT(&V_dadq); } /* * Start Duplicate Address Detection (DAD) for specified interface address. */ void nd6_dad_start(struct ifaddr *ifa, int delay) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; KASSERT((ia->ia6_flags & IN6_IFF_TENTATIVE) != 0, ("starting DAD on non-tentative address %p", ifa)); /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled globally or on the interface * - the interface address is anycast */ if ((ia->ia6_flags & IN6_IFF_ANYCAST) != 0 || V_ip6_dad_count == 0 || (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) != 0) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 || (ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0) return; if ((dp = nd6_dad_find(ifa, NULL)) != NULL) { /* * DAD is already in progress. Let the existing entry * finish it. */ nd6_dad_rele(dp); return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT | M_ZERO); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } callout_init(&dp->dad_timer_ch, 0); #ifdef VIMAGE dp->dad_vnet = curvnet; #endif nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; ifa_ref(dp->dad_ifa); dp->dad_count = V_ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; dp->dad_ns_lcount = dp->dad_loopbackprobe = 0; /* Add this to the dadq and add a reference for the dadq. */ refcount_init(&dp->dad_refcnt, 1); nd6_dad_add(dp); nd6_dad_starttimer(dp, delay, 0); } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; dp = nd6_dad_find(ifa, NULL); if (!dp) { /* DAD wasn't started yet */ return; } nd6_dad_stoptimer(dp); nd6_dad_del(dp); /* Release this function's reference, acquired by nd6_dad_find(). */ nd6_dad_rele(dp); } static void nd6_dad_timer(struct dadq *dp) { CURVNET_SET(dp->dad_vnet); struct ifaddr *ifa = dp->dad_ifa; struct ifnet *ifp = dp->dad_ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(ia != NULL, ("DAD entry %p with no address", dp)); if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { /* Do not need DAD for ifdisabled interface. */ log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of " "ND6_IFF_IFDISABLED.\n", ifp->if_xname); goto err; } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto err; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { log(LOG_ERR, "nd6_dad_timer: called with non-tentative address " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); goto err; } /* Stop DAD if the interface is down even after dad_maxtry attempts. */ if ((dp->dad_ns_tcount > V_dad_maxtry) && (((ifp->if_flags & IFF_UP) == 0) || ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))) { nd6log((LOG_INFO, "%s: could not run DAD " "because the interface was down or not running.\n", if_name(ifa->ifa_ifp))); goto err; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { /* * We have more NS to go. Send NS packet for DAD. */ nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000, 1); goto done; } else { /* * We have transmitted sufficient number of DAD packets. * See what we've got. */ if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0) /* We've seen NS or NA, means DAD has failed. */ nd6_dad_duplicated(ifa, dp); else if (V_dad_enhanced != 0 && dp->dad_ns_lcount > 0 && dp->dad_ns_lcount > dp->dad_loopbackprobe) { /* * Sec. 4.1 in RFC 7527 requires transmission of * additional probes until the loopback condition * becomes clear when a looped back probe is detected. */ log(LOG_ERR, "%s: a looped back NS message is " "detected during DAD for %s. " "Another DAD probes are being sent.\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, IFA_IN6(ifa))); dp->dad_loopbackprobe = dp->dad_ns_lcount; /* * Send an NS immediately and increase dad_count by * V_nd6_mmaxtries - 1. */ dp->dad_count = dp->dad_ns_ocount + V_nd6_mmaxtries - 1; nd6_dad_starttimer(dp, (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000, 1); goto done; } else { /* * We are done with DAD. No NA came, no NS came. * No duplicate address found. Check IFDISABLED flag * again in case that it is changed between the * beginning of this function and here. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0) ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); if (dp->dad_ns_lcount > 0) log(LOG_ERR, "%s: DAD completed while " "a looped back NS message is detected " "during DAD for %s.\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, IFA_IN6(ifa))); } } err: nd6_dad_del(dp); done: CURVNET_RESTORE(); } static void nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp; char ip6buf[INET6_ADDRSTRLEN]; log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out/loopback=%d/%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; ifp = ifa->ifa_ifp; log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifp)); /* * If the address is a link-local address formed from an interface * identifier based on the hardware address which is supposed to be * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP * operation on the interface SHOULD be disabled. * [RFC 4862, Section 5.4.5] */ if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { struct in6_addr in6; /* * To avoid over-reaction, we only apply this logic when we are * very sure that hardware addresses are supposed to be unique. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_ATM: case IFT_IEEE1394: case IFT_INFINIBAND: in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: possible hardware address " "duplication detected, disable IPv6\n", if_name(ifp)); } break; } } } static void nd6_dad_ns_output(struct dadq *dp) { struct in6_ifaddr *ia = (struct in6_ifaddr *)dp->dad_ifa; struct ifnet *ifp = dp->dad_ifa->ifa_ifp; int i; dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { return; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { return; } dp->dad_ns_ocount++; if (V_dad_enhanced != 0) { for (i = 0; i < ND_OPT_NONCE_LEN32; i++) dp->dad_nonce[i] = arc4random(); /* * XXXHRS: Note that in the case that * DupAddrDetectTransmits > 1, multiple NS messages with * different nonces can be looped back in an unexpected * order. The current implementation recognizes only * the latest nonce on the sender side. Practically it * should work well in almost all cases. */ } nd6_ns_output(ifp, NULL, NULL, &ia->ia_addr.sin6_addr, (uint8_t *)&dp->dad_nonce[0]); } static void nd6_dad_ns_input(struct ifaddr *ifa, struct nd_opt_nonce *ndopt_nonce) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); /* Ignore Nonce option when Enhanced DAD is disabled. */ if (V_dad_enhanced == 0) ndopt_nonce = NULL; dp = nd6_dad_find(ifa, ndopt_nonce); if (dp == NULL) return; dp->dad_ns_icount++; nd6_dad_rele(dp); } static void nd6_dad_na_input(struct ifaddr *ifa) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa, NULL); if (dp != NULL) { dp->dad_na_icount++; nd6_dad_rele(dp); } } diff --git a/sys/netinet6/nd6_rtr.c b/sys/netinet6/nd6_rtr.c index d0b8dc51eba7..03442745f96a 100644 --- a/sys/netinet6/nd6_rtr.c +++ b/sys/netinet6/nd6_rtr.c @@ -1,2522 +1,2525 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int); static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); static void defrouter_delreq(struct nd_defrouter *); static void nd6_rtmsg(int, struct rtentry *); static int in6_init_prefix_ltimes(struct nd_prefix *); static void in6_init_address_ltimes(struct nd_prefix *, struct in6_addrlifetime *); static int rt6_deleteroute(const struct rtentry *, void *); VNET_DECLARE(int, nd6_recalc_reachtm_interval); #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) VNET_DEFINE_STATIC(struct ifnet *, nd6_defifp); VNET_DEFINE(int, nd6_defifindex); #define V_nd6_defifp VNET(nd6_defifp) VNET_DEFINE(int, ip6_use_tempaddr) = 0; VNET_DEFINE(int, ip6_desync_factor); VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME; VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME; VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; #ifdef EXPERIMENTAL VNET_DEFINE(int, nd6_ignore_ipv6_only_ra) = 1; #endif /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 #define RTPREF_LOW (-1) #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program * (rtadvd) so here we have no function like nd6_ra_output(). * * Based on RFC 2461 */ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * Accept RS only when V_ip6_forwarding=1 and the interface has * no ND6_IFF_ACCEPT_RTADV. */ if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; /* Sanity checks */ if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } /* * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_rs_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0); freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badrs); m_freem(m); } #ifdef EXPERIMENTAL /* * An initial update routine for draft-ietf-6man-ipv6only-flag. * We need to iterate over all default routers for the given * interface to see whether they are all advertising the "S" * (IPv6-Only) flag. If they do set, otherwise unset, the * interface flag we later use to filter on. */ static void defrtr_ipv6_only_ifp(struct ifnet *ifp) { struct nd_defrouter *dr; bool ipv6_only, ipv6_only_old; #ifdef INET struct epoch_tracker et; struct ifaddr *ifa; bool has_ipv4_addr; #endif if (V_nd6_ignore_ipv6_only_ra != 0) return; ipv6_only = true; ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) if (dr->ifp == ifp && (dr->raflags & ND_RA_FLAG_IPV6_ONLY) == 0) ipv6_only = false; ND6_RUNLOCK(); IF_AFDATA_WLOCK(ifp); ipv6_only_old = ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY; IF_AFDATA_WUNLOCK(ifp); /* If nothing changed, we have an early exit. */ if (ipv6_only == ipv6_only_old) return; #ifdef INET /* * Should we want to set the IPV6-ONLY flag, check if the * interface has a non-0/0 and non-link-local IPv4 address * configured on it. If it has we will assume working * IPv4 operations and will clear the interface flag. */ has_ipv4_addr = false; if (ipv6_only) { NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (in_canforward( satosin(ifa->ifa_addr)->sin_addr)) { has_ipv4_addr = true; break; } } NET_EPOCH_EXIT(et); } if (ipv6_only && has_ipv4_addr) { log(LOG_NOTICE, "%s rcvd RA w/ IPv6-Only flag set but has IPv4 " "configured, ignoring IPv6-Only flag.\n", ifp->if_xname); ipv6_only = false; } #endif IF_AFDATA_WLOCK(ifp); if (ipv6_only) ND_IFINFO(ifp)->flags |= ND6_IFF_IPV6_ONLY; else ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY; IF_AFDATA_WUNLOCK(ifp); #ifdef notyet /* Send notification of flag change. */ #endif } static void defrtr_ipv6_only_ipf_down(struct ifnet *ifp) { IF_AFDATA_WLOCK(ifp); ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY; IF_AFDATA_WUNLOCK(ifp); } #endif /* EXPERIMENTAL */ void nd6_ifnet_link_event(void *arg __unused, struct ifnet *ifp, int linkstate) { /* * XXX-BZ we might want to trigger re-evaluation of our default router * availability. E.g., on link down the default router might be * unreachable but a different interface might still have connectivity. */ #ifdef EXPERIMENTAL if (linkstate == LINK_STATE_DOWN) defrtr_ipv6_only_ipf_down(ifp); #endif } /* * Receive Router Advertisement Message. * * Based on RFC 2461 * TODO: on-link bit on prefix information * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing */ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; int mcast = 0; union nd_opts ndopts; struct nd_defrouter *dr; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; dr = NULL; /* * We only accept RAs only when the per-interface flag * ND6_IFF_ACCEPT_RTADV is on the receiving interface. */ if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(ip6bufs, &saddr6))); goto bad; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ra_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; /* remember if this is a multicasted advertisement */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) mcast = 1; bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.raflags = nd_ra->nd_ra_flags_reserved; /* * Effectively-disable routes from RA messages when * ND6_IFF_NO_RADR enabled on the receiving interface or * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1). */ if (ndi->flags & ND6_IFF_NO_RADR) dr0.rtlifetime = 0; else if (V_ip6_forwarding && !V_ip6_rfc6204w3) dr0.rtlifetime = 0; else dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_uptime + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? (RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); if (advreachable <= MAX_REACHABLE_TIME && ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); if (nd_ra->nd_ra_curhoplimit) { if (ndi->chlim < nd_ra->nd_ra_curhoplimit) ndi->chlim = nd_ra->nd_ra_curhoplimit; else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) { log(LOG_ERR, "RA with a lower CurHopLimit sent from " "%s on %s (current = %d, received = %d). " "Ignored.\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), if_name(ifp), ndi->chlim, nd_ra->nd_ra_curhoplimit); } } dr = defrtrlist_update(&dr0); #ifdef EXPERIMENTAL defrtr_ipv6_only_ifp(ifp); #endif } /* * prefix */ if (ndopts.nd_opts_pi) { struct nd_opt_hdr *pt; struct nd_opt_prefix_info *pi = NULL; struct nd_prefixctl pr; for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi; pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end; pt = (struct nd_opt_hdr *)((caddr_t)pt + (pt->nd_opt_len << 3))) { if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION) continue; pi = (struct nd_opt_prefix_info *)pt; if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, "nd6_ra_input: invalid option " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "len %d for prefix information option, " "ignored\n", pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, "nd6_ra_input: invalid prefix " "%s, ignored\n", ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } bzero(&pr, sizeof(pr)); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif; pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_ONLINK) ? 1 : 0; pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved & ND_OPT_PI_FLAG_AUTO) ? 1 : 0; pr.ndpr_plen = pi->nd_opt_pi_prefix_len; pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time); pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); (void)prelist_update(&pr, dr, m, mcast); } } if (dr != NULL) { defrouter_rele(dr); dr = NULL; } /* * MTU */ if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) { u_long mtu; u_long maxmtu; mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu); /* lower bound */ if (mtu < IPV6_MMTU) { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " "mtu=%lu sent from %s, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } /* upper bound */ maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu) ? ndi->maxmtu : ifp->if_mtu; if (mtu <= maxmtu) { int change = (ndi->linkmtu != mtu); ndi->linkmtu = mtu; if (change) { /* in6_maxmtu may change */ in6_setmaxmtu(); rt_updatemtu(ifp); } } else { nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " "mtu=%lu sent from %s; " "exceeds maxmtu %lu, ignoring\n", mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } skip: /* * Source link layer address */ { char *lladdr = NULL; int lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_ADVERT, 0); /* * Installing a link-layer address might change the state of the * router's neighbor cache, which might also affect our on-link * detection of adveritsed prefixes. */ pfxlist_onlink_check(); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badra); m_freem(m); } /* tell the change to user processes watching the routing socket. */ static void nd6_rtmsg(int cmd, struct rtentry *rt) { struct rt_addrinfo info; struct ifnet *ifp; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifp = rt->rt_ifp; if (ifp != NULL) { struct epoch_tracker et; NET_EPOCH_ENTER(et); ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); info.rti_info[RTAX_IFP] = ifa->ifa_addr; ifa_ref(ifa); NET_EPOCH_EXIT(et); info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } else ifa = NULL; rt_missmsg_fib(cmd, &info, rt->rt_flags, 0, rt->rt_fibnum); if (ifa != NULL) ifa_free(ifa); } /* * default router list processing sub routines */ static void defrouter_addreq(struct nd_defrouter *new) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; int error; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &newrt, new->ifp->if_fib); if (newrt) { nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RTFREE(newrt); } if (error == 0) new->installed = 1; } struct nd_defrouter * defrouter_lookup_locked(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_LOCK_ASSERT(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { defrouter_ref(dr); return (dr); } return (NULL); } struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_RLOCK(); dr = defrouter_lookup_locked(addr, ifp); ND6_RUNLOCK(); return (dr); } void defrouter_ref(struct nd_defrouter *dr) { refcount_acquire(&dr->refcnt); } void defrouter_rele(struct nd_defrouter *dr) { if (refcount_release(&dr->refcnt)) free(dr, M_IP6NDP); } /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select_fib(), and * should not be called from anywhere else. */ static void defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); bzero(&gate, sizeof(gate)); def.sin6_len = mask.sin6_len = gate.sin6_len = sizeof(struct sockaddr_in6); def.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, dr->ifp->if_fib); if (oldrt) { nd6_rtmsg(RTM_DELETE, oldrt); RTFREE(oldrt); } dr->installed = 0; } /* * Remove all default routes from default router list. */ void defrouter_reset(void) { struct nd_defrouter *dr, **dra; int count, i; count = i = 0; /* * We can't delete routes with the ND lock held, so make a copy of the * current default router list and use that when deleting routes. */ ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) count++; ND6_RUNLOCK(); dra = malloc(count * sizeof(*dra), M_TEMP, M_WAITOK | M_ZERO); ND6_RLOCK(); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (i == count) break; defrouter_ref(dr); dra[i++] = dr; } ND6_RUNLOCK(); for (i = 0; i < count && dra[i] != NULL; i++) { defrouter_delreq(dra[i]); defrouter_rele(dra[i]); } free(dra, M_TEMP); /* * XXX should we also nuke any default routers in the kernel, by * going through them by rtalloc1()? */ } /* * Look up a matching default router list entry and remove it. Returns true if a * matching entry was found, false otherwise. */ bool defrouter_remove(struct in6_addr *addr, struct ifnet *ifp) { struct nd_defrouter *dr; ND6_WLOCK(); dr = defrouter_lookup_locked(addr, ifp); if (dr == NULL) { ND6_WUNLOCK(); return (false); } defrouter_unlink(dr, NULL); ND6_WUNLOCK(); defrouter_del(dr); defrouter_rele(dr); return (true); } /* * Remove a router from the global list and optionally stash it in a * caller-supplied queue. * * The ND lock must be held. */ void defrouter_unlink(struct nd_defrouter *dr, struct nd_drhead *drq) { ND6_WLOCK_ASSERT(); TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); V_nd6_list_genid++; if (drq != NULL) TAILQ_INSERT_TAIL(drq, dr, dr_entry); } void defrouter_del(struct nd_defrouter *dr) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; struct nd_pfxrouter *pfxrtr; ND6_UNLOCK_ASSERT(); /* * Flush all the routing table entries that use the router * as a next hop. */ if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) rt6_flush(&dr->rtaddr, dr->ifp); #ifdef EXPERIMENTAL defrtr_ipv6_only_ifp(dr->ifp); #endif if (dr->installed) { deldr = dr; defrouter_delreq(dr); } /* * Also delete all the pointers to the router in each prefix lists. */ ND6_WLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); } ND6_WUNLOCK(); pfxlist_onlink_check(); /* * If the router is the primary one, choose a new one. * Note that defrouter_select_fib() will remove the current * gateway from the routing table. */ if (deldr) defrouter_select_fib(deldr->ifp->if_fib); /* * Release the list reference. */ defrouter_rele(dr); } /* * Default Router Selection according to Section 6.3.6 of RFC 2461 and * draft-ietf-ipngwg-router-selection: * 1) Routers that are reachable or probably reachable should be preferred. * If we have more than one (probably) reachable router, prefer ones * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. * * We assume nd_defrouter is sorted by router preference value. * Since the code below covers both with and without router preference cases, * we do not need to classify the cases by ifdef. * * At this moment, we do not try to install more than one default router, * even when the multipath routing is available, because we're not sure about * the benefits for stub hosts comparing to the risk of making the code * complicated and the possibility of introducing bugs. * * We maintain a single list of routers for multiple FIBs, only considering one * at a time based on the receiving interface's FIB. If @fibnum is RT_ALL_FIBS, * we do the whole thing multiple times. */ void defrouter_select_fib(int fibnum) { struct epoch_tracker et; struct nd_defrouter *dr, *selected_dr, *installed_dr; struct llentry *ln = NULL; if (fibnum == RT_ALL_FIBS) { for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { defrouter_select_fib(fibnum); } } ND6_RLOCK(); /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ if (TAILQ_EMPTY(&V_nd_defrouter)) { ND6_RUNLOCK(); return; } /* * Search for a (probably) reachable router from the list. * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ selected_dr = installed_dr = NULL; TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { NET_EPOCH_ENTER(et); if (selected_dr == NULL && dr->ifp->if_fib == fibnum && (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; defrouter_ref(selected_dr); } NET_EPOCH_EXIT(et); if (ln != NULL) { LLE_RUNLOCK(ln); ln = NULL; } if (dr->installed && dr->ifp->if_fib == fibnum) { if (installed_dr == NULL) { installed_dr = dr; defrouter_ref(installed_dr); } else { /* * this should not happen. * warn for diagnosis. */ log(LOG_ERR, "defrouter_select_fib: more than " "one router is installed\n"); } } } /* * If none of the default routers was found to be reachable, * round-robin the list regardless of preference. * Otherwise, if we have an installed router, check if the selected * (reachable) router should really be preferred to the installed one. * We only prefer the new router when the old one is not reachable * or when the new one has a really higher preference value. */ if (selected_dr == NULL) { if (installed_dr == NULL || TAILQ_NEXT(installed_dr, dr_entry) == NULL) dr = TAILQ_FIRST(&V_nd_defrouter); else dr = TAILQ_NEXT(installed_dr, dr_entry); /* Ensure we select a router for this FIB. */ TAILQ_FOREACH_FROM(dr, &V_nd_defrouter, dr_entry) { if (dr->ifp->if_fib == fibnum) { selected_dr = dr; defrouter_ref(selected_dr); break; } } } else if (installed_dr != NULL) { NET_EPOCH_ENTER(et); if ((ln = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln) && installed_dr->ifp->if_fib == fibnum && rtpref(selected_dr) <= rtpref(installed_dr)) { defrouter_rele(selected_dr); selected_dr = installed_dr; } NET_EPOCH_EXIT(et); if (ln != NULL) LLE_RUNLOCK(ln); } ND6_RUNLOCK(); /* * If we selected a router for this FIB and it's different * than the installed one, remove the installed router and * install the selected one in its place. */ if (installed_dr != selected_dr) { if (installed_dr != NULL) { defrouter_delreq(installed_dr); defrouter_rele(installed_dr); } if (selected_dr != NULL) defrouter_addreq(selected_dr); } if (selected_dr != NULL) defrouter_rele(selected_dr); } /* * Maintain old KPI for default router selection. * If unspecified, we can re-select routers for all FIBs. */ void defrouter_select(void) { defrouter_select_fib(RT_ALL_FIBS); } /* * for default router selection * regards router-preference field as a 2-bit signed integer */ static int rtpref(struct nd_defrouter *dr) { switch (dr->raflags & ND_RA_FLAG_RTPREF_MASK) { case ND_RA_FLAG_RTPREF_HIGH: return (RTPREF_HIGH); case ND_RA_FLAG_RTPREF_MEDIUM: case ND_RA_FLAG_RTPREF_RSV: return (RTPREF_MEDIUM); case ND_RA_FLAG_RTPREF_LOW: return (RTPREF_LOW); default: /* * This case should never happen. If it did, it would mean a * serious bug of kernel internal. We thus always bark here. * Or, can we even panic? */ log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags); return (RTPREF_INVALID); } /* NOTREACHED */ } static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { struct nd_defrouter *dr, *n; uint64_t genid; int oldpref; bool writelocked; if (new->rtlifetime == 0) { defrouter_remove(&new->rtaddr, new->ifp); return (NULL); } ND6_RLOCK(); writelocked = false; restart: dr = defrouter_lookup_locked(&new->rtaddr, new->ifp); if (dr != NULL) { oldpref = rtpref(dr); /* override */ dr->raflags = new->raflags; /* XXX flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; /* * If the preference does not change, there's no need * to sort the entries. Also make sure the selected * router is still installed in the kernel. */ if (dr->installed && rtpref(new) == oldpref) { if (writelocked) ND6_WUNLOCK(); else ND6_RUNLOCK(); return (dr); } } /* * The router needs to be reinserted into the default router * list, so upgrade to a write lock. If that fails and the list * has potentially changed while the lock was dropped, we'll * redo the lookup with the write lock held. */ if (!writelocked) { writelocked = true; if (!ND6_TRY_UPGRADE()) { genid = V_nd6_list_genid; ND6_RUNLOCK(); ND6_WLOCK(); if (genid != V_nd6_list_genid) goto restart; } } if (dr != NULL) { /* * The preferred router may have changed, so relocate this * router. */ TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; } else { n = malloc(sizeof(*n), M_IP6NDP, M_NOWAIT | M_ZERO); if (n == NULL) { ND6_WUNLOCK(); return (NULL); } memcpy(n, new, sizeof(*n)); /* Initialize with an extra reference for the caller. */ refcount_init(&n->refcnt, 2); } /* * Insert the new router in the Default Router List; * The Default Router List should be in the descending order * of router-preferece. Routers with the same preference are * sorted in the arriving time order. */ /* insert at the end of the group */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (rtpref(n) > rtpref(dr)) break; } if (dr != NULL) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); V_nd6_list_genid++; ND6_WUNLOCK(); defrouter_select_fib(new->ifp->if_fib); return (n); } static struct nd_pfxrouter * pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; ND6_LOCK_ASSERT(); LIST_FOREACH(search, &pr->ndpr_advrtrs, pfr_entry) { if (search->router == dr) break; } return (search); } static void pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; bool update; ND6_UNLOCK_ASSERT(); ND6_RLOCK(); if (pfxrtr_lookup(pr, dr) != NULL) { ND6_RUNLOCK(); return; } ND6_RUNLOCK(); new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); if (new == NULL) return; defrouter_ref(dr); new->router = dr; ND6_WLOCK(); if (pfxrtr_lookup(pr, dr) == NULL) { LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); update = true; } else { /* We lost a race to add the reference. */ defrouter_rele(dr); free(new, M_IP6NDP); update = false; } ND6_WUNLOCK(); if (update) pfxlist_onlink_check(); } static void pfxrtr_del(struct nd_pfxrouter *pfr) { ND6_WLOCK_ASSERT(); LIST_REMOVE(pfr, pfr_entry); defrouter_rele(pfr->router); free(pfr, M_IP6NDP); } static struct nd_prefix * nd6_prefix_lookup_locked(struct nd_prefixctl *key) { struct nd_prefix *search; ND6_LOCK_ASSERT(); LIST_FOREACH(search, &V_nd_prefix, ndpr_entry) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, &search->ndpr_prefix.sin6_addr, key->ndpr_plen)) { nd6_prefix_ref(search); break; } } return (search); } struct nd_prefix * nd6_prefix_lookup(struct nd_prefixctl *key) { struct nd_prefix *search; ND6_RLOCK(); search = nd6_prefix_lookup_locked(key); ND6_RUNLOCK(); return (search); } void nd6_prefix_ref(struct nd_prefix *pr) { refcount_acquire(&pr->ndpr_refcnt); } void nd6_prefix_rele(struct nd_prefix *pr) { if (refcount_release(&pr->ndpr_refcnt)) { KASSERT(LIST_EMPTY(&pr->ndpr_advrtrs), ("prefix %p has advertising routers", pr)); free(pr, M_IP6NDP); } } int nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { struct nd_prefix *new; char ip6buf[INET6_ADDRSTRLEN]; int error; new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); if (new == NULL) return (ENOMEM); refcount_init(&new->ndpr_refcnt, newp != NULL ? 2 : 1); new->ndpr_ifp = pr->ndpr_ifp; new->ndpr_prefix = pr->ndpr_prefix; new->ndpr_plen = pr->ndpr_plen; new->ndpr_vltime = pr->ndpr_vltime; new->ndpr_pltime = pr->ndpr_pltime; new->ndpr_flags = pr->ndpr_flags; if ((error = in6_init_prefix_ltimes(new)) != 0) { free(new, M_IP6NDP); return (error); } new->ndpr_lastupdate = time_uptime; /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask); ND6_WLOCK(); LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); V_nd6_list_genid++; ND6_WUNLOCK(); /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { + struct epoch_tracker et; + ND6_ONLINK_LOCK(); + NET_EPOCH_ENTER(et); if ((error = nd6_prefix_onlink(new)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link on %s (errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), error)); /* proceed anyway. XXX: is it correct? */ } + NET_EPOCH_EXIT(et); ND6_ONLINK_UNLOCK(); } if (dr != NULL) pfxrtr_add(new, dr); if (newp != NULL) *newp = new; return (0); } /* * Remove a prefix from the prefix list and optionally stash it in a * caller-provided list. * * The ND6 lock must be held. */ void nd6_prefix_unlink(struct nd_prefix *pr, struct nd_prhead *list) { ND6_WLOCK_ASSERT(); LIST_REMOVE(pr, ndpr_entry); V_nd6_list_genid++; if (list != NULL) LIST_INSERT_HEAD(list, pr, ndpr_entry); } /* * Free an unlinked prefix, first marking it off-link if necessary. */ void nd6_prefix_del(struct nd_prefix *pr) { struct nd_pfxrouter *pfr, *next; int e; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(pr->ndpr_addrcnt == 0, ("prefix %p has referencing addresses", pr)); ND6_UNLOCK_ASSERT(); /* * Though these flags are now meaningless, we'd rather keep the value * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users * when executing "ndp -p". */ if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { ND6_ONLINK_LOCK(); if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_del: failed to make %s/%d offlink " "on %s, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* what should we do? */ } ND6_ONLINK_UNLOCK(); } /* Release references to routers that have advertised this prefix. */ ND6_WLOCK(); LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next) pfxrtr_del(pfr); ND6_WUNLOCK(); nd6_prefix_rele(pr); pfxlist_onlink_check(); } static int prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; struct nd_prefix *pr; int error = 0; int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; - struct epoch_tracker et; + + NET_EPOCH_ASSERT(); auth = 0; if (m) { /* * Authenticity for NA consists authentication for * both IP header and IP datagrams, doesn't it ? */ #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM) auth = ((m->m_flags & M_AUTHIPHDR) && (m->m_flags & M_AUTHIPDGM)); #endif } if ((pr = nd6_prefix_lookup(new)) != NULL) { /* * nd6_prefix_lookup() ensures that pr and new have the same * prefix on a same interface. */ /* * Update prefix information. Note that the on-link (L) bit * and the autonomous (A) bit should NOT be changed from 1 * to 0. */ if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) pr->ndpr_raf_auto = 1; if (new->ndpr_raf_onlink) { pr->ndpr_vltime = new->ndpr_vltime; pr->ndpr_pltime = new->ndpr_pltime; (void)in6_init_prefix_ltimes(pr); /* XXX error case? */ pr->ndpr_lastupdate = time_uptime; } if (new->ndpr_raf_onlink && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { ND6_ONLINK_LOCK(); if ((error = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), error)); /* proceed anyway. XXX: is it correct? */ } ND6_ONLINK_UNLOCK(); } if (dr != NULL) pfxrtr_add(pr, dr); } else { if (new->ndpr_vltime == 0) goto end; if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0) goto end; error = nd6_prelist_add(new, dr, &pr); if (error != 0) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s errno=%d\n", ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error)); goto end; /* we should just give up in this case. */ } /* * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ if (pr->ndpr_raf_onlink == 0) { pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; in6_init_prefix_ltimes(pr); } } /* * Address autoconfiguration based on Section 5.5.3 of RFC 2462. * Note that pr must be non NULL at this point. */ /* 5.5.3 (a). Ignore the prefix without the A bit set. */ if (!new->ndpr_raf_auto) goto end; /* * 5.5.3 (b). the link-local prefix should have been ignored in * nd6_ra_input. */ /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ if (new->ndpr_pltime > new->ndpr_vltime) { error = EINVAL; /* XXX: won't be used */ goto end; } /* * 5.5.3 (d). If the prefix advertised is not equal to the prefix of * an address configured by stateless autoconfiguration already in the * list of addresses associated with the interface, and the Valid * Lifetime is not 0, form an address. We first check if we have * a matching prefix. * Note: we apply a clarification in rfc2462bis-02 here. We only * consider autoconfigured addresses while RFC2462 simply said * "address". */ - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; /* * We only consider autoconfigured addresses as per rfc2462bis. */ if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) continue; /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) continue; /* * Ignore the address if it is not associated with a prefix * or is associated with a prefix that is different from this * one. (pr is never NULL here) */ if (ifa6->ia6_ndpr != pr) continue; if (ia6_match == NULL) /* remember the first one */ ia6_match = ifa6; /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. * We apply some clarifications in rfc2462bis: * - use remaininglifetime instead of storedlifetime as a * variable name * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) remaininglifetime = ND6_INFINITE_LIFETIME; else if (time_uptime - ifa6->ia6_updatetime > lt6_tmp.ia6t_vltime) { /* * The case of "invalid" address. We should usually * not see this case. */ remaininglifetime = 0; } else remaininglifetime = lt6_tmp.ia6t_vltime - (time_uptime - ifa6->ia6_updatetime); /* when not updating, keep the current stored lifetime. */ lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; in6_init_address_ltimes(pr, <6_tmp); /* * We need to treat lifetimes for temporary addresses * differently, according to * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1); * we only update the lifetimes when they are in the maximum * intervals. */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxvltime = V_ip6_temp_valid_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxvltime = 0; if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_uptime - ifa6->ia6_createtime) + V_ip6_desync_factor)) { maxpltime = V_ip6_temp_preferred_lifetime - (time_uptime - ifa6->ia6_createtime) - V_ip6_desync_factor; } else maxpltime = 0; if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_vltime > maxvltime) { lt6_tmp.ia6t_vltime = maxvltime; } if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME || lt6_tmp.ia6t_pltime > maxpltime) { lt6_tmp.ia6t_pltime = maxpltime; } } ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_uptime; } - NET_EPOCH_EXIT(et); if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; /* * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ /* * Prefix Length check: * If the sum of the prefix length and interface identifier * length does not equal 128 bits, the Prefix Information * option MUST be ignored. The length of the interface * identifier is defined in a separate link-type specific * document. */ ifidlen = in6_if2idlen(ifp); if (ifidlen < 0) { /* this should not happen, so we always log it. */ log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", if_name(ifp)); goto end; } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, "prelist_update: invalid prefixlen " "%d for %s, ignored\n", pr->ndpr_plen, if_name(ifp))); goto end; } if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ pr->ndpr_addrcnt++; ia6->ia6_ndpr = pr; /* * RFC 3041 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * * RFC 3041 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary * addresses. Thus, we specifiy 1 as the 2nd arg of * in6_tmpifadd(). */ if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } ifa_free(&ia6->ia_ifa); /* * A newly added address might affect the status * of other addresses, so we check and update it. * XXX: what if address duplication happens? */ pfxlist_onlink_check(); } else { /* just set an error. do not bark here. */ error = EADDRNOTAVAIL; /* XXX: might be unused. */ } } end: if (pr != NULL) nd6_prefix_rele(pr); return (error); } /* * A supplement function used in the on-link detection below; * detect if a given prefix has a (probably) reachable advertising router. * XXX: lengthy function name... */ static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { struct epoch_tracker et; struct nd_pfxrouter *pfxrtr; struct llentry *ln; int canreach; ND6_LOCK_ASSERT(); + NET_EPOCH_ENTER(et); LIST_FOREACH(pfxrtr, &pr->ndpr_advrtrs, pfr_entry) { - NET_EPOCH_ENTER(et); ln = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp); - NET_EPOCH_EXIT(et); if (ln == NULL) continue; canreach = ND6_IS_LLINFO_PROBREACH(ln); LLE_RUNLOCK(ln); if (canreach) break; } + NET_EPOCH_EXIT(et); return (pfxrtr); } /* * Check if each prefix in the prefix list has at least one available router * that advertised the prefix (a router is "available" if its neighbor cache * entry is reachable or probably reachable). * If the check fails, the prefix may be off-link, because, for example, * we have moved from the network but the lifetime of the prefix has not * expired yet. So we should not use the prefix if there is another prefix * that has an available router. * But, if there is no prefix that has an available router, we still regard * all the prefixes as on-link. This is because we can't tell if all the * routers are simply dead or if we really moved from the network and there * is no router around us. */ void pfxlist_onlink_check(void) { struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; struct nd_pfxrouter *pfxrtr = NULL; struct rm_priotracker in6_ifa_tracker; uint64_t genid; uint32_t flags; ND6_ONLINK_LOCK(); ND6_RLOCK(); /* * Check if there is a prefix that has a reachable advertising * router. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } /* * If we have no such prefix, check whether we still have a router * that does not advertise any prefixes. */ if (pr == NULL) { TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { struct nd_prefix *pr0; LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; } if (pfxrtr != NULL) break; } } if (pr != NULL || (!TAILQ_EMPTY(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise * any prefixes. The latter would be the case when we move * to a new link where we have a router that does not provide * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { /* XXX: a link-local prefix should never be detached */ if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || pr->ndpr_raf_onlink == 0 || pr->ndpr_raf_auto == 0) continue; if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && find_pfxlist_reachable_router(pr) == NULL) pr->ndpr_stateflags |= NDPRF_DETACHED; else if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && find_pfxlist_reachable_router(pr) != NULL) pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } else { /* there is no prefix that has a reachable router */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || pr->ndpr_raf_onlink == 0 || pr->ndpr_raf_auto == 0) continue; pr->ndpr_stateflags &= ~NDPRF_DETACHED; } } /* * Remove each interface route associated with a (just) detached * prefix, and reinstall the interface route for a (just) attached * prefix. Note that all attempt of reinstallation does not * necessarily success, when a same prefix is shared among multiple * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { char ip6buf[INET6_ADDRSTRLEN]; int e; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || pr->ndpr_raf_onlink == 0 || pr->ndpr_raf_auto == 0) continue; flags = pr->ndpr_stateflags & (NDPRF_DETACHED | NDPRF_ONLINK); if (flags == 0 || flags == (NDPRF_DETACHED | NDPRF_ONLINK)) { genid = V_nd6_list_genid; ND6_RUNLOCK(); if ((flags & NDPRF_ONLINK) != 0 && (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } else if ((flags & NDPRF_ONLINK) == 0 && (e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; } } /* * Changes on the prefix status might affect address status as well. * Make sure that all addresses derived from an attached prefix are * attached, and that all addresses derived from a detached prefix are * detached. Note, however, that a manually configured address should * always be attached. * The precise detection logic is same as the one for prefixes. */ IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; if (ifa->ia6_ndpr == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ continue; } if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) break; } if (ifa) { CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ continue; if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) { if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; nd6_dad_start((struct ifaddr *)ifa, 0); } } else { ifa->ia6_flags |= IN6_IFF_DETACHED; } } } else { CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ifa->ia6_flags & IN6_IFF_DETACHED) { ifa->ia6_flags &= ~IN6_IFF_DETACHED; ifa->ia6_flags |= IN6_IFF_TENTATIVE; /* Do we need a delay in this case? */ nd6_dad_start((struct ifaddr *)ifa, 0); } } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); ND6_RUNLOCK(); ND6_ONLINK_UNLOCK(); } static int nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) { struct sockaddr_dl sdl; struct rtentry *rt; struct sockaddr_in6 mask6; u_long rtflags; int error, a_failure, fibnum, maxfib; /* * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs. * ifa->ifa_rtrequest = nd6_rtrequest; */ bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP; bzero(&sdl, sizeof(struct sockaddr_dl)); sdl.sdl_len = sizeof(struct sockaddr_dl); sdl.sdl_family = AF_LINK; sdl.sdl_type = ifa->ifa_ifp->if_type; sdl.sdl_index = ifa->ifa_ifp->if_index; if(V_rt_add_addr_allfibs) { fibnum = 0; maxfib = rt_numfibs; } else { fibnum = ifa->ifa_ifp->if_fib; maxfib = fibnum + 1; } a_failure = 0; for (; fibnum < maxfib; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, (struct sockaddr *)&sdl, (struct sockaddr *)&mask6, rtflags, &rt, fibnum); if (error == 0) { KASSERT(rt != NULL, ("%s: in6_rtrequest return no " "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__, error, pr, ifa)); RT_LOCK(rt); nd6_rtmsg(RTM_ADD, rt); RT_UNLOCK(rt); pr->ndpr_stateflags |= NDPRF_ONLINK; } else { char ip6buf[INET6_ADDRSTRLEN]; char ip6bufg[INET6_ADDRSTRLEN]; char ip6bufm[INET6_ADDRSTRLEN]; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add " "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, " "flags=%lx errno = %d\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), ip6_sprintf(ip6bufg, &sin6->sin6_addr), ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error)); /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RT_LOCK(rt); RT_REMREF(rt); RT_UNLOCK(rt); } } /* Return the last error we got. */ return (a_failure); } int nd6_prefix_onlink(struct nd_prefix *pr) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; char ip6buf[INET6_ADDRSTRLEN]; int error; ND6_ONLINK_LOCK_ASSERT(); ND6_UNLOCK_ASSERT(); if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) return (EEXIST); /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. * Although such a configuration is expected to be rare, we explicitly * allow it. */ ND6_RLOCK(); LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { if (opr == pr) continue; if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) continue; if (!V_rt_add_addr_allfibs && opr->ndpr_ifp->if_fib != pr->ndpr_ifp->if_fib) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { ND6_RUNLOCK(); return (0); } } ND6_RUNLOCK(); /* * We prefer link-local addresses as the associated interface address. */ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { struct epoch_tracker et; /* XXX: freebsd does not have ifa_ifwithaf */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET6) { ifa_ref(ifa); break; } } NET_EPOCH_EXIT(et); /* should we care about ia6_flags? */ } if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA * containing a prefix with the L bit set and the A bit clear, * after removing all IPv6 addresses on the receiving * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, "nd6_prefix_onlink: failed to find any ifaddr" " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); } error = nd6_prefix_onlink_rtrequest(pr, ifa); if (ifa != NULL) ifa_free(ifa); return (error); } int nd6_prefix_offlink(struct nd_prefix *pr) { int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; struct sockaddr_in6 sa6, mask6; struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; uint64_t genid; int fibnum, maxfib, a_failure; ND6_ONLINK_LOCK_ASSERT(); ND6_UNLOCK_ASSERT(); if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) return (EEXIST); bzero(&sa6, sizeof(sa6)); sa6.sin6_family = AF_INET6; sa6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr, sizeof(struct in6_addr)); bzero(&mask6, sizeof(mask6)); mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); if (V_rt_add_addr_allfibs) { fibnum = 0; maxfib = rt_numfibs; } else { fibnum = ifp->if_fib; maxfib = fibnum + 1; } a_failure = 0; for (; fibnum < maxfib; fibnum++) { rt = NULL; error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL, (struct sockaddr *)&mask6, 0, &rt, fibnum); if (error == 0) { /* report the route deletion to the routing socket. */ if (rt != NULL) nd6_rtmsg(RTM_DELETE, rt); } else { /* Save last error to return, see rtinit(). */ a_failure = error; } if (rt != NULL) { RTFREE(rt); } } error = a_failure; a_failure = 1; if (error == 0) { pr->ndpr_stateflags &= ~NDPRF_ONLINK; /* * There might be the same prefix on another interface, * the prefix which could not be on-link just because we have * the interface route (see comments in nd6_prefix_onlink). * If there's one, try to make the prefix on-link on the * interface. */ ND6_RLOCK(); restart: LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) { /* * KAME specific: detached prefixes should not be * on-link. */ if (opr == pr || (opr->ndpr_stateflags & (NDPRF_ONLINK | NDPRF_DETACHED)) != 0) continue; if (opr->ndpr_plen == pr->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { int e; genid = V_nd6_list_genid; ND6_RUNLOCK(); if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " "to %s (errno = %d)\n", ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } else a_failure = 0; ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; } } ND6_RUNLOCK(); } else { /* XXX: can we still set the NDPRF_ONLINK flag? */ nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s (errno = %d)\n", ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), error)); } if (a_failure) lltable_prefix_free(AF_INET6, (struct sockaddr *)&sa6, (struct sockaddr *)&mask6, LLE_STATIC); return (error); } static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; struct in6_addr mask; int prefixlen = pr->ndpr_plen; int updateflags; char ip6buf[INET6_ADDRSTRLEN]; in6_prefixlen2mask(&mask, prefixlen); /* * find a link-local address (will be interface ID). * Is it really mandatory? Theoretically, a global or a site-local * address can be configured without a link-local address, if we * have a unique interface identifier... * * it is not mandatory to have a link-local address, we can generate * interface identifier on the fly. we do this because: * (1) it should be the easiest way to find interface identifier. * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */ if (ifa) ib = (struct in6_ifaddr *)ifa; else return NULL; /* prefixlen + ifidlen must be equal to 128 */ plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); if (prefixlen != plen0) { ifa_free(ifa); nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); return NULL; } /* make ifaddr */ in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask); IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask); /* interface ID */ ifra.ifra_addr.sin6_addr.s6_addr32[0] |= (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); ifra.ifra_addr.sin6_addr.s6_addr32[1] |= (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); ifa_free(ifa); /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ /* * Make sure that we do not have this address already. This should * usually not happen, but we can still see this case, e.g., if we * have manually configured the exact address to be configured. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (ifa != NULL) { ifa_free(ifa); /* this should be rare enough to make an explicit log */ log(LOG_INFO, "in6_ifadd: %s is already configured\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); return (NULL); } /* * Allocate ifaddr structure, link into chain, etc. * If we are going to create a new address upon receiving a multicasted * RA, we need to impose a random delay before starting DAD. * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ updateflags = 0; if (mcast) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), if_name(ifp), error)); return (NULL); /* ifaddr must not have been allocated. */ } ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); /* * XXXRW: Assumption of non-NULLness here might not be true with * fine-grained locking -- should we validate it? Or just return * earlier ifa rather than looking it up again? */ return (ia); /* this is always non-NULL and referenced. */ } /* * ia0 - corresponding public address */ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia; struct in6_aliasreq ifra; int error; int trylimit = 3; /* XXX: adhoc value */ int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr, &ia0->ia_prefixmask.sin6_addr); ifra.ifra_addr = ia0->ia_addr; /* XXX: do we need this ? */ /* clear the old IFID */ IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &ifra.ifra_prefixmask.sin6_addr); again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " "random IFID\n")); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); ifra.ifra_addr.sin6_addr.s6_addr32[3] |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* * in6_get_tmpifid() quite likely provided a unique interface ID. * However, we may still have a chance to see collision, because * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. */ if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) { if (trylimit-- > 0) { forcegen = 1; goto again; } /* Give up. Something strange should have happened. */ nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " "find a unique random IFID\n")); return (EEXIST); } /* * The Valid Lifetime is the lower of the Valid Lifetime of the * public address or TEMP_VALID_LIFETIME. * The Preferred Lifetime is the lower of the Preferred Lifetime * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_uptime - ia0->ia6_updatetime)); if (vltime0 > V_ip6_temp_valid_lifetime) vltime0 = V_ip6_temp_valid_lifetime; } else vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_uptime - ia0->ia6_updatetime)); if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; } } else pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? */ ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ updateflags = 0; if (delay) updateflags |= IN6_IFAUPDATE_DADDELAY; if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? */ nd6log((LOG_ERR, "in6_tmpifadd: ifa update succeeded, but we got " "no ifaddr\n")); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; newia->ia6_ndpr->ndpr_addrcnt++; ifa_free(&newia->ia_ifa); /* * A newly added address might affect the status of other addresses. * XXX: when the temporary address is generated with a new public * address, the onlink check is redundant. However, it would be safe * to do the check explicitly everywhere a new address is generated, * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ pfxlist_onlink_check(); return (0); } static int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_preferred = 0; else ndpr->ndpr_preferred = time_uptime + ndpr->ndpr_pltime; if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ndpr->ndpr_expire = 0; else ndpr->ndpr_expire = time_uptime + ndpr->ndpr_vltime; return 0; } static void in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) { /* init ia6t_expire */ if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = time_uptime; lt6->ia6t_expire += lt6->ia6t_vltime; } /* init ia6t_preferred */ if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = time_uptime; lt6->ia6t_preferred += lt6->ia6t_pltime; } } /* * Delete all the routing table entries that use the specified gateway. * XXX: this function causes search through all entries of routing table, so * it shouldn't be called when acting as a router. */ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { /* We'll care only link-local addresses */ if (!IN6_IS_ADDR_LINKLOCAL(gateway)) return; /* XXX Do we really need to walk any but the default FIB? */ rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway); } static int rt6_deleteroute(const struct rtentry *rt, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) struct in6_addr *gate = (struct in6_addr *)arg; if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) return (0); if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) { return (0); } /* * Do not delete a static route. * XXX: this seems to be a bit ad-hoc. Should we consider the * 'cloned' bit instead? */ if ((rt->rt_flags & RTF_STATIC) != 0) return (0); /* * We delete only host route. This means, in particular, we don't * delete default route. */ if ((rt->rt_flags & RTF_HOST) == 0) return (0); return (1); #undef SIN6 } int nd6_setdefaultiface(int ifindex) { int error = 0; if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); if (V_nd6_defifindex != ifindex) { V_nd6_defifindex = ifindex; if (V_nd6_defifindex > 0) V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one maping between * interfaces and links, so it would be natural to use the * default interface as the default link. */ scope6_setdefault(V_nd6_defifp); } return (error); } diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index a0d06230e852..ef903bc6bd66 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -1,912 +1,906 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; extern u_long rip_recvspace; VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat); VNET_PCPUSTAT_SYSINIT(rip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rip6stat); #endif /* VIMAGE */ /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip6_mrouter); /* * The various mrouter functions. */ int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; struct epoch_tracker et; RIP6STAT_INC(rip6s_ipackets); init_sin6(&fromsa, m, 0); /* general init */ ifp = m->m_pkthdr.rcvif; INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; if (last != NULL) { struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6)) { if (n != NULL && IPSEC_CHECK_POLICY(ipv6, n, last) != 0) { m_freem(n); /* Do not inject data into pcb. */ n = NULL; } } #endif /* IPSEC */ if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); last = NULL; } INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) goto skip_2; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) goto skip_2; } if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { RIP6STAT_INC(rip6s_badsum); /* * Drop the received message, don't send an * ICMP6 message. Set proto to IPPROTO_NONE * to achieve that. */ proto = IPPROTO_NONE; goto skip_2; } } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it * through unconditionally to the raw socket. * * Use the M_RTALERT_MLD flag to check for MLD * traffic without having to inspect the mbuf chain * more deeply, as all MLDv1/v2 host messages MUST * contain the Router Alert option. * * In the case of MLDv1, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. im6o_mc_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if ((m->m_flags & M_RTALERT_MLD) == 0) { struct sockaddr_in6 mcaddr; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(inp->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); goto skip_2; } } last = inp; continue; skip_2: INP_RUNLOCK(inp); } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6) && last != NULL && IPSEC_CHECK_POLICY(ipv6, m, last) != 0) { m_freem(m); IP6STAT_DEC(ip6s_delivered); /* Do not inject data into pcb. */ INP_RUNLOCK(last); } else #endif /* IPSEC */ if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. */ m_adj(m, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); INP_RUNLOCK(last); } else { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); if (proto == IPPROTO_NONE) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); } return (IPPROTO_DONE); } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. */ int rip6_output(struct mbuf *m, struct socket *so, ...) { struct mbuf *control; struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct ip6_hdr *ip6; struct inpcb *inp; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int use_defzone = 0; int hlim = 0; struct in6_addr in6a; va_list ap; va_start(ap, so); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); inp = sotoinpcb(so); INP_WLOCK(inp); if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = inp->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (!optp || !optp->ip6po_pktinfo || !optp->ip6po_pktinfo->ipi6_ifindex) use_defzone = V_ip6_use_defzone; if (dstsock->sin6_scope_id == 0 && !use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; error = prison_check_ip6(inp->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = inp->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || inp->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = inp->in6p_cksum; if (plen < off + 2) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } /* * Send RA/RS messages to user land for protection, before sending * them to rtadvd/rtsol. */ if ((send_sendso_input_hook != NULL) && so->so_proto->pr_protocol == IPPROTO_ICMPV6) { switch (type) { case ND_ROUTER_ADVERT: case ND_ROUTER_SOLICIT: mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m, mtag); } } error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); ICMP6STAT_INC(icp6s_outhist[type]); } else RIP6STAT_INC(rip6s_opackets); goto freectl; bad: if (m) m_freem(m); freectl: if (control != NULL) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(inp); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { inp = sotoinpcb(so); INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { - struct epoch_tracker et; struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; + NET_EPOCH_ASSERT(); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0) return (error); if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); - NET_EPOCH_ENTER(et); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && - (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) { - NET_EPOCH_EXIT(et); + (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) return (EADDRNOTAVAIL); - } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { - NET_EPOCH_EXIT(et); + IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) return (EADDRNOTAVAIL); - } - NET_EPOCH_EXIT(et); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as it should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ error = in6_selectsrc_socket(addr, inp->in6p_outputopts, inp, so->so_cred, scope_ambiguous, &in6a, NULL); if (error) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } if (nam->sa_len != sizeof(struct sockaddr_in6)) { m_freem(m); return (EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index df09eb36cfa3..96cffd6305a4 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -1,776 +1,780 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* IP payload compression protocol (IPComp), see RFC 2393 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include /* XXXGL: net_epoch should move out there */ +#include /* XXXGL: net_epoch should move out there */ #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include VNET_DEFINE(int, ipcomp_enable) = 1; VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat); VNET_PCPUSTAT_SYSINIT(ipcompstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipcompstat); #endif /* VIMAGE */ SYSCTL_DECL(_net_inet_ipcomp); SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats, struct ipcompstat, ipcompstat, "IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h"); static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); /* * RFC 3173 p 2.2. Non-Expansion Policy: * If the total size of a compressed payload and the IPComp header, as * defined in section 3, is not smaller than the size of the original * payload, the IP datagram MUST be sent in the original non-compressed * form. * * When we use IPComp in tunnel mode, for small packets we will receive * encapsulated IP-IP datagrams without any compression and without IPComp * header. */ static int ipcomp_encapcheck(union sockaddr_union *src, union sockaddr_union *dst) { struct secasvar *sav; sav = key_allocsa_tunnel(src, dst, IPPROTO_IPCOMP); if (sav == NULL) return (0); key_freesav(&sav); if (src->sa.sa_family == AF_INET) return (sizeof(struct in_addr) << 4); else return (sizeof(struct in6_addr) << 4); } static int ipcomp_nonexp_input(struct mbuf *m, int off, int proto, void *arg __unused) { int isr; + NET_EPOCH_ASSERT(); + switch (proto) { #ifdef INET case IPPROTO_IPV4: isr = NETISR_IP; break; #endif #ifdef INET6 case IPPROTO_IPV6: isr = NETISR_IPV6; break; #endif default: IPCOMPSTAT_INC(ipcomps_nopf); m_freem(m); return (IPPROTO_DONE); } m_adj(m, off); IPCOMPSTAT_ADD(ipcomps_ibytes, m->m_pkthdr.len); IPCOMPSTAT_INC(ipcomps_input); netisr_dispatch(isr, m); return (IPPROTO_DONE); } /* * ipcomp_init() is called when an CPI is being set up. */ static int ipcomp_init(struct secasvar *sav, struct xformsw *xsp) { const struct comp_algo *tcomp; struct cryptoini cric; /* NB: algorithm really comes in alg_enc and not alg_comp! */ tcomp = comp_algorithm_lookup(sav->alg_enc); if (tcomp == NULL) { DPRINTF(("%s: unsupported compression algorithm %d\n", __func__, sav->alg_comp)); return EINVAL; } sav->alg_comp = sav->alg_enc; /* set for doing histogram */ sav->tdb_xform = xsp; sav->tdb_compalgxform = tcomp; /* Initialize crypto session */ bzero(&cric, sizeof (cric)); cric.cri_alg = sav->tdb_compalgxform->type; return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support); } /* * ipcomp_zeroize() used when IPCA is deleted */ static int ipcomp_zeroize(struct secasvar *sav) { crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = NULL; return 0; } /* * ipcomp_input() gets called to uncompress an input packet */ static int ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { struct xform_data *xd; struct cryptodesc *crdc; struct cryptop *crp; struct ipcomp *ipcomp; caddr_t addr; int error, hlen = IPCOMP_HLENGTH; /* * Check that the next header of the IPComp is not IPComp again, before * doing any real work. Given it is not possible to do double * compression it means someone is playing tricks on us. */ error = ENOBUFS; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); key_freesav(&sav); return (error); } addr = (caddr_t) mtod(m, struct ip *) + skip; ipcomp = (struct ipcomp *)addr; if (ipcomp->comp_nxt == IPPROTO_IPCOMP) { IPCOMPSTAT_INC(ipcomps_pdrops); /* XXX have our own stats? */ DPRINTF(("%s: recursive compression detected\n", __func__)); error = EINVAL; goto bad; } /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: no crypto descriptors\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); goto bad; } /* Get IPsec-specific opaque pointer */ xd = malloc(sizeof(*xd), M_XDATA, M_NOWAIT | M_ZERO); if (xd == NULL) { DPRINTF(("%s: cannot allocate xform_data\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); crypto_freereq(crp); goto bad; } crdc = crp->crp_desc; crdc->crd_skip = skip + hlen; crdc->crd_len = m->m_pkthdr.len - (skip + hlen); crdc->crd_inject = skip; /* Decompression operation */ crdc->crd_alg = sav->tdb_compalgxform->type; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len - (skip + hlen); crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_input_cb; crp->crp_opaque = (caddr_t) xd; /* These are passed as-is to the callback */ xd->sav = sav; xd->protoff = protoff; xd->skip = skip; xd->vnet = curvnet; SECASVAR_LOCK(sav); crp->crp_session = xd->cryptoid = sav->tdb_cryptoid; SECASVAR_UNLOCK(sav); return crypto_dispatch(crp); bad: m_freem(m); key_freesav(&sav); return (error); } /* * IPComp input callback from the crypto driver. */ static int ipcomp_input_cb(struct cryptop *crp) { IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]); struct xform_data *xd; struct mbuf *m; struct secasvar *sav; struct secasindex *saidx; caddr_t addr; crypto_session_t cryptoid; int hlen = IPCOMP_HLENGTH, error, clen; int skip, protoff; uint8_t nproto; m = (struct mbuf *) crp->crp_buf; xd = (struct xform_data *) crp->crp_opaque; CURVNET_SET(xd->vnet); sav = xd->sav; skip = xd->skip; protoff = xd->protoff; cryptoid = xd->cryptoid; saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); /* Check for crypto errors */ if (crp->crp_etype) { if (crp->crp_etype == EAGAIN) { /* Reset the session ID */ if (ipsec_updateid(sav, &crp->crp_session, &cryptoid) != 0) crypto_freesession(cryptoid); xd->cryptoid = crp->crp_session; CURVNET_RESTORE(); return (crypto_dispatch(crp)); } IPCOMPSTAT_INC(ipcomps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: null mbuf returned from crypto\n", __func__)); error = EINVAL; goto bad; } IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]); clen = crp->crp_olen; /* Length of data after processing */ /* Release the crypto descriptors */ free(xd, M_XDATA), xd = NULL; crypto_freereq(crp), crp = NULL; /* In case it's not done already, adjust the size of the mbuf chain */ m->m_pkthdr.len = clen + hlen + skip; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); error = EINVAL; /*XXX*/ goto bad; } /* Keep the next protocol field */ addr = (caddr_t) mtod(m, struct ip *) + skip; nproto = ((struct ipcomp *) addr)->comp_nxt; /* Remove the IPCOMP header */ error = m_striphdr(m, skip, hlen); if (error) { IPCOMPSTAT_INC(ipcomps_hdrops); DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi))); goto bad; } /* Restore the Next Protocol field */ m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto); switch (saidx->dst.sa.sa_family) { #ifdef INET6 case AF_INET6: error = ipsec6_common_input_cb(m, sav, skip, protoff); break; #endif #ifdef INET case AF_INET: error = ipsec4_common_input_cb(m, sav, skip, protoff); break; #endif default: panic("%s: Unexpected address family: %d saidx=%p", __func__, saidx->dst.sa.sa_family, saidx); } CURVNET_RESTORE(); return error; bad: CURVNET_RESTORE(); if (sav != NULL) key_freesav(&sav); if (m != NULL) m_freem(m); if (xd != NULL) free(xd, M_XDATA); if (crp != NULL) crypto_freereq(crp); return error; } /* * IPComp output routine, called by ipsec[46]_perform_request() */ static int ipcomp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav, u_int idx, int skip, int protoff) { IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]); const struct comp_algo *ipcompx; struct cryptodesc *crdc; struct cryptop *crp; struct xform_data *xd; int error, ralen, maxpacketsize; IPSEC_ASSERT(sav != NULL, ("null SA")); ipcompx = sav->tdb_compalgxform; IPSEC_ASSERT(ipcompx != NULL, ("null compression xform")); /* * Do not touch the packet in case our payload to compress * is lower than the minimal threshold of the compression * alogrithm. We will just send out the data uncompressed. * See RFC 3173, 2.2. Non-Expansion Policy. */ if (m->m_pkthdr.len <= ipcompx->minlen) { IPCOMPSTAT_INC(ipcomps_threshold); return ipsec_process_done(m, sp, sav, idx); } ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */ IPCOMPSTAT_INC(ipcomps_output); /* Check for maximum packet size violations. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: IPCOMPSTAT_INC(ipcomps_nopf); DPRINTF(("%s: unknown/unsupported protocol family %d, " "IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) { IPCOMPSTAT_INC(ipcomps_toobig); DPRINTF(("%s: packet in IPCA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi), ralen + skip + IPCOMP_HLENGTH, maxpacketsize)); error = EMSGSIZE; goto bad; } /* Update the counters */ IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip); m = m_unshare(m, M_NOWAIT); if (m == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } /* Ok now, we can pass to the crypto processing. */ /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); error = ENOBUFS; goto bad; } crdc = crp->crp_desc; /* Compression descriptor */ crdc->crd_skip = skip; crdc->crd_len = ralen; crdc->crd_flags = CRD_F_COMP; crdc->crd_inject = skip; /* Compression operation */ crdc->crd_alg = ipcompx->type; /* IPsec-specific opaque crypto info */ xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO); if (xd == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: failed to allocate xform_data\n", __func__)); crypto_freereq(crp); error = ENOBUFS; goto bad; } xd->sp = sp; xd->sav = sav; xd->idx = idx; xd->skip = skip; xd->protoff = protoff; xd->vnet = curvnet; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_output_cb; crp->crp_opaque = (caddr_t) xd; SECASVAR_LOCK(sav); crp->crp_session = xd->cryptoid = sav->tdb_cryptoid; SECASVAR_UNLOCK(sav); return crypto_dispatch(crp); bad: if (m) m_freem(m); key_freesav(&sav); key_freesp(&sp); return (error); } /* * IPComp output callback from the crypto driver. */ static int ipcomp_output_cb(struct cryptop *crp) { IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]); struct xform_data *xd; struct secpolicy *sp; struct secasvar *sav; struct mbuf *m; crypto_session_t cryptoid; u_int idx; int error, skip, protoff; m = (struct mbuf *) crp->crp_buf; xd = (struct xform_data *) crp->crp_opaque; CURVNET_SET(xd->vnet); idx = xd->idx; sp = xd->sp; sav = xd->sav; skip = xd->skip; protoff = xd->protoff; cryptoid = xd->cryptoid; /* Check for crypto errors */ if (crp->crp_etype) { if (crp->crp_etype == EAGAIN) { /* Reset the session ID */ if (ipsec_updateid(sav, &crp->crp_session, &cryptoid) != 0) crypto_freesession(cryptoid); xd->cryptoid = crp->crp_session; CURVNET_RESTORE(); return (crypto_dispatch(crp)); } IPCOMPSTAT_INC(ipcomps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: bogus return buffer from crypto\n", __func__)); error = EINVAL; goto bad; } IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]); if (crp->crp_ilen - skip > crp->crp_olen) { struct mbuf *mo; struct ipcomp *ipcomp; int roff; uint8_t prot; /* Compression helped, inject IPCOMP header. */ mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff); if (mo == NULL) { IPCOMPSTAT_INC(ipcomps_wrap); DPRINTF(("%s: IPCOMP header inject failed " "for IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff); /* Initialize the IPCOMP header */ /* XXX alignment always correct? */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt; break; #endif } ipcomp->comp_flags = 0; ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi)); /* Fix Next Protocol in IPv4/IPv6 header */ prot = IPPROTO_IPCOMP; m_copyback(m, protoff, sizeof(u_int8_t), (u_char *)&prot); /* Adjust the length in the IP header */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len); break; #endif /* INET */ #ifdef INET6 case AF_INET6: mtod(m, struct ip6_hdr *)->ip6_plen = htons(m->m_pkthdr.len) - sizeof(struct ip6_hdr); break; #endif /* INET6 */ default: IPCOMPSTAT_INC(ipcomps_nopf); DPRINTF(("%s: unknown/unsupported protocol " "family %d, IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } } else { /* Compression was useless, we have lost time. */ IPCOMPSTAT_INC(ipcomps_uncompr); DPRINTF(("%s: compressions was useless %d - %d <= %d\n", __func__, crp->crp_ilen, skip, crp->crp_olen)); /* XXX remember state to not compress the next couple * of packets, RFC 3173, 2.2. Non-Expansion Policy */ } /* Release the crypto descriptor */ free(xd, M_XDATA); crypto_freereq(crp); /* NB: m is reclaimed by ipsec_process_done. */ error = ipsec_process_done(m, sp, sav, idx); CURVNET_RESTORE(); return (error); bad: if (m) m_freem(m); CURVNET_RESTORE(); free(xd, M_XDATA); crypto_freereq(crp); key_freesav(&sav); key_freesp(&sp); return (error); } #ifdef INET static int ipcomp4_nonexp_encapcheck(const struct mbuf *m, int off, int proto, void *arg __unused) { union sockaddr_union src, dst; const struct ip *ip; if (V_ipcomp_enable == 0) return (0); if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6) return (0); bzero(&src, sizeof(src)); bzero(&dst, sizeof(dst)); src.sa.sa_family = dst.sa.sa_family = AF_INET; src.sin.sin_len = dst.sin.sin_len = sizeof(struct sockaddr_in); ip = mtod(m, const struct ip *); src.sin.sin_addr = ip->ip_src; dst.sin.sin_addr = ip->ip_dst; return (ipcomp_encapcheck(&src, &dst)); } static const struct encaptab *ipe4_cookie = NULL; static const struct encap_config ipv4_encap_cfg = { .proto = -1, .min_length = sizeof(struct ip), .exact_match = sizeof(in_addr_t) << 4, .check = ipcomp4_nonexp_encapcheck, .input = ipcomp_nonexp_input }; #endif #ifdef INET6 static int ipcomp6_nonexp_encapcheck(const struct mbuf *m, int off, int proto, void *arg __unused) { union sockaddr_union src, dst; const struct ip6_hdr *ip6; if (V_ipcomp_enable == 0) return (0); if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6) return (0); bzero(&src, sizeof(src)); bzero(&dst, sizeof(dst)); src.sa.sa_family = dst.sa.sa_family = AF_INET; src.sin6.sin6_len = dst.sin6.sin6_len = sizeof(struct sockaddr_in6); ip6 = mtod(m, const struct ip6_hdr *); src.sin6.sin6_addr = ip6->ip6_src; dst.sin6.sin6_addr = ip6->ip6_dst; if (IN6_IS_SCOPE_LINKLOCAL(&src.sin6.sin6_addr)) { /* XXX: sa6_recoverscope() */ src.sin6.sin6_scope_id = ntohs(src.sin6.sin6_addr.s6_addr16[1]); src.sin6.sin6_addr.s6_addr16[1] = 0; } if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6.sin6_addr)) { /* XXX: sa6_recoverscope() */ dst.sin6.sin6_scope_id = ntohs(dst.sin6.sin6_addr.s6_addr16[1]); dst.sin6.sin6_addr.s6_addr16[1] = 0; } return (ipcomp_encapcheck(&src, &dst)); } static const struct encaptab *ipe6_cookie = NULL; static const struct encap_config ipv6_encap_cfg = { .proto = -1, .min_length = sizeof(struct ip6_hdr), .exact_match = sizeof(struct in6_addr) << 4, .check = ipcomp6_nonexp_encapcheck, .input = ipcomp_nonexp_input }; #endif static struct xformsw ipcomp_xformsw = { .xf_type = XF_IPCOMP, .xf_name = "IPcomp", .xf_init = ipcomp_init, .xf_zeroize = ipcomp_zeroize, .xf_input = ipcomp_input, .xf_output = ipcomp_output, }; static void ipcomp_attach(void) { #ifdef INET ipe4_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); #endif #ifdef INET6 ipe6_cookie = ip6_encap_attach(&ipv6_encap_cfg, NULL, M_WAITOK); #endif xform_attach(&ipcomp_xformsw); } static void ipcomp_detach(void) { #ifdef INET ip_encap_detach(ipe4_cookie); #endif #ifdef INET6 ip6_encap_detach(ipe6_cookie); #endif xform_detach(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); SYSUNINIT(ipcomp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_detach, NULL); diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c index 0bd869076779..c5f0f1518751 100644 --- a/sys/netpfil/ipfw/ip_dn_io.c +++ b/sys/netpfil/ipfw/ip_dn_io.c @@ -1,975 +1,978 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Dummynet portions related to packet handling. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include /* NET_EPOCH_... */ #include #include #include #include /* ip_len, ip_off */ #include /* ip_output(), IP_FORWARDING */ #include #include #include /* various ether_* routines */ #include /* for ip6_input, ip6_output prototypes */ #include #include #include #include #ifdef NEW_AQM #include #endif #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) * instead of dn_cfg.curr_time */ struct dn_parms dn_cfg; //VNET_DEFINE(struct dn_parms, _base_dn_cfg); static long tick_last; /* Last tick duration (usec). */ static long tick_delta; /* Last vs standard tick diff (usec). */ static long tick_delta_sum; /* Accumulated tick difference (usec).*/ static long tick_adjustment; /* Tick adjustments done. */ static long tick_lost; /* Lost(coalesced) ticks number. */ /* Adjusted vs non-adjusted curr_time difference (ticks). */ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; #ifdef NEW_AQM unsigned long io_pkt_drop; #else static unsigned long io_pkt_drop; #endif /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events * are extracted. */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); #ifdef SYSCTL_NODE /* * Because of the way the SYSBEGIN/SYSEND macros work on other * platforms, there should not be functions between them. * So keep the handlers outside the block. */ static int sysctl_hash_size(SYSCTL_HANDLER_ARGS) { int error, value; value = dn_cfg.hash_size; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value < 16 || value > 65536) return (EINVAL); dn_cfg.hash_size = value; return (0); } static int sysctl_limits(SYSCTL_HANDLER_ARGS) { int error; long value; if (arg2 != 0) value = dn_cfg.slot_limit; else value = dn_cfg.byte_limit; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (arg2 != 0) { if (value < 1) return (EINVAL); dn_cfg.slot_limit = value; } else { if (value < 1500) return (EINVAL); dn_cfg.byte_limit = value; } return (0); } SYSBEGIN(f4) SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); #ifdef NEW_AQM SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); #else static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); #endif /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) #define DC(x) (&(dn_cfg.x)) /* parameters */ SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, "I", "Default hash table size"); SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, "L", "Upper limit in slots for pipe queue."); SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, "L", "Upper limit in bytes for pipe queue."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); /* RED parameters */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); /* time adjustment */ SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, CTLFLAG_RD, &tick_diff, 0, "Adjusted vs non-adjusted curr_time difference (ticks)."); SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, CTLFLAG_RD, &tick_lost, 0, "Number of ticks coalesced by dummynet taskqueue."); /* Drain parameters */ SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); /* statistics */ SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, CTLFLAG_RD, &io_pkt, 0, "Number of packets passed to dummynet."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, CTLFLAG_RD, &io_pkt_fast, 0, "Number of packets bypassed dummynet scheduler."); SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, CTLFLAG_RD, &io_pkt_drop, 0, "Number of packets dropped by dummynet."); #undef DC SYSEND #endif static void dummynet_send(struct mbuf *); /* * Return the mbuf tag holding the dummynet state (it should * be the first one on the list). */ struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); #ifdef NEW_AQM /* XXX: to skip ts m_tag. For Debugging only*/ if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) { m_tag_delete(m,mtag); mtag = m_tag_first(m); D("skip TS tag"); } #endif KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } #ifndef NEW_AQM static inline void mq_append(struct mq *q, struct mbuf *m) { #ifdef USERSPACE // buffers from netmap need to be copied // XXX note that the routine is not expected to fail ND("append %p to %p", m, q); if (m->m_flags & M_STACK) { struct mbuf *m_new; void *p; int l, ofs; ofs = m->m_data - m->__m_extbuf; // XXX allocate MGETHDR(m_new, M_NOWAIT, MT_DATA); ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", m, m->__m_extbuf, m->__m_extlen, ofs, m_new); p = m_new->__m_extbuf; /* new pointer */ l = m_new->__m_extlen; /* new len */ if (l <= m->__m_extlen) { panic("extlen too large"); } *m_new = *m; // copy m_new->m_flags &= ~M_STACK; m_new->__m_extbuf = p; // point to new buffer _pkt_copy(m->__m_extbuf, p, m->__m_extlen); m_new->m_data = p + ofs; m = m_new; } #endif /* USERSPACE */ if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->count++; q->tail = m; m->m_nextpkt = NULL; } #endif /* * Dispose a list of packet. Use a functions so if we need to do * more work, this is a central point to do it. */ void dn_free_pkts(struct mbuf *mnext) { struct mbuf *m; while ((m = mnext) != NULL) { mnext = m->m_nextpkt; FREE_PKT(m); } } static int red_drops (struct dn_queue *q, int len) { /* * RED algorithm * * RED calculates the average queue size (avg) using a low-pass filter * with an exponential weighted (w_q) moving average: * avg <- (1-w_q) * avg + w_q * q_size * where q_size is the queue length (measured in bytes or * packets). * * If q_size == 0, we compute the idle time for the link, and set * avg = (1 - w_q)^(idle/s) * where s is the time needed for transmitting a medium-sized packet. * * Now, if avg < min_th the packet is enqueued. * If avg > max_th the packet is dropped. Otherwise, the packet is * dropped with probability P function of avg. */ struct dn_fsk *fs = q->fs; int64_t p_b = 0; /* Queue in bytes or packets? */ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? q->ni.len_bytes : q->ni.length; /* Average queue size estimation. */ if (q_size != 0) { /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ int diff = SCALE(q_size) - q->avg; int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); q->avg += (int)v; } else { /* * Queue is empty, find for how long the queue has been * empty and use a lookup table for computing * (1 - * w_q)^(idle_time/s) where s is the time to send a * (small) packet. * XXX check wraps... */ if (q->avg) { u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); q->avg = (t < fs->lookup_depth) ? SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; } } /* Should i drop? */ if (q->avg < fs->min_th) { q->count = -1; return (0); /* accept packet */ } if (q->avg >= fs->max_th) { /* average queue >= max threshold */ if (fs->fs.flags & DN_IS_ECN) return (1); if (fs->fs.flags & DN_IS_GENTLE_RED) { /* * According to Gentle-RED, if avg is greater than * max_th the packet is dropped with a probability * p_b = c_3 * avg - c_4 * where c_3 = (1 - max_p) / max_th * c_4 = 1 - 2 * max_p */ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4; } else { q->count = -1; return (1); } } else if (q->avg > fs->min_th) { if (fs->fs.flags & DN_IS_ECN) return (1); /* * We compute p_b using the linear dropping function * p_b = c_1 * avg - c_2 * where c_1 = max_p / (max_th - min_th) * c_2 = max_p * min_th / (max_th - min_th) */ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; } if (fs->fs.flags & DN_QSIZE_BYTES) p_b = div64((p_b * len) , fs->max_pkt_size); if (++q->count == 0) q->random = random() & 0xffff; else { /* * q->count counts packets arrived since last drop, so a greater * value of q->count means a greater packet drop probability. */ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { q->count = 0; /* After a drop we calculate a new random value. */ q->random = random() & 0xffff; return (1); /* drop */ } } /* End of RED algorithm. */ return (0); /* accept */ } /* * ECN/ECT Processing (partially adopted from altq) */ #ifndef NEW_AQM static #endif int ecn_mark(struct mbuf* m) { struct ip *ip; ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off); switch (ip->ip_v) { case IPVERSION: { uint16_t old; if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) return (0); /* not-ECT */ if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) return (1); /* already marked */ /* * ecn-capable but not marked, * mark CE and update checksum */ old = *(uint16_t *)ip; ip->ip_tos |= IPTOS_ECN_CE; ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); return (1); } #ifdef INET6 case (IPV6_VERSION >> 4): { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return (0); /* version mismatch! */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_NOTECT << 20)) return (0); /* not-ECT */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_CE << 20)) return (1); /* already marked */ /* * ecn-capable but not marked, mark CE */ flowlabel |= (IPTOS_ECN_CE << 20); ip6->ip6_flow = htonl(flowlabel); return (1); } #endif } return (0); } /* * Enqueue a packet in q, subject to space and queue management policy * (whose parameters are in q->fs). * Update stats for the queue and the scheduler. * Return 0 on success, 1 on drop. The packet is consumed anyways. */ int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) { struct dn_fs *f; struct dn_flow *ni; /* stats for scheduler instance */ uint64_t len; if (q->fs == NULL || q->_si == NULL) { printf("%s fs %p si %p, dropping\n", __FUNCTION__, q->fs, q->_si); FREE_PKT(m); return 1; } f = &(q->fs->fs); ni = &q->_si->ni; len = m->m_pkthdr.len; /* Update statistics, then check reasons to drop pkt. */ q->ni.tot_bytes += len; q->ni.tot_pkts++; ni->tot_bytes += len; ni->tot_pkts++; if (drop) goto drop; if (f->plr && random() < f->plr) goto drop; #ifdef NEW_AQM /* Call AQM enqueue function */ if (q->fs->aqmfp) return q->fs->aqmfp->enqueue(q ,m); #endif if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) goto drop; } if (f->flags & DN_QSIZE_BYTES) { if (q->ni.len_bytes > f->qsize) goto drop; } else if (q->ni.length >= f->qsize) { goto drop; } mq_append(&q->mq, m); q->ni.length++; q->ni.len_bytes += len; ni->length++; ni->len_bytes += len; return (0); drop: io_pkt_drop++; q->ni.drops++; ni->drops++; FREE_PKT(m); return (1); } /* * Fetch packets from the delay line which are due now. If there are * leftover packets, reinsert the delay line in the heap. * Runs under scheduler lock. */ static void transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) { struct mbuf *m; struct dn_pkt_tag *pkt = NULL; dline->oid.subtype = 0; /* not in heap */ while ((m = dline->mq.head) != NULL) { pkt = dn_tag_get(m); if (!DN_KEY_LEQ(pkt->output_time, now)) break; dline->mq.head = m->m_nextpkt; dline->mq.count--; mq_append(q, m); } if (m != NULL) { dline->oid.subtype = 1; /* in heap */ heap_insert(&dn_cfg.evheap, pkt->output_time, dline); } } /* * Convert the additional MAC overheads/delays into an equivalent * number of bits for the given data rate. The samples are * in milliseconds so we need to divide by 1000. */ static uint64_t extra_bits(struct mbuf *m, struct dn_schk *s) { int index; uint64_t bits; struct dn_profile *pf = s->profile; if (!pf || pf->samples_no == 0) return 0; index = random() % pf->samples_no; bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); if (index >= pf->loss_level) { struct dn_pkt_tag *dt = dn_tag_get(m); if (dt) dt->dn_dir = DIR_DROP; } return bits; } /* * Send traffic from a scheduler instance due by 'now'. * Return a pointer to the head of the queue. */ static struct mbuf * serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) { struct mq def_q; struct dn_schk *s = si->sched; struct mbuf *m = NULL; int delay_line_idle = (si->dline.mq.head == NULL); int done, bw; if (q == NULL) { q = &def_q; q->head = NULL; } bw = s->link.bandwidth; si->kflags &= ~DN_ACTIVE; if (bw > 0) si->credit += (now - si->sched_time) * bw; else si->credit = 0; si->sched_time = now; done = 0; while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { uint64_t len_scaled; done++; len_scaled = (bw == 0) ? 0 : hz * (m->m_pkthdr.len * 8 + extra_bits(m, s)); si->credit -= len_scaled; /* Move packet in the delay line */ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; mq_append(&si->dline.mq, m); } /* * If credit >= 0 the instance is idle, mark time. * Otherwise put back in the heap, and adjust the output * time of the last inserted packet, m, which was too early. */ if (si->credit >= 0) { si->idle_time = now; } else { uint64_t t; KASSERT (bw > 0, ("bw=0 and credit<0 ?")); t = div64(bw - 1 - si->credit, bw); if (m) dn_tag_get(m)->output_time += t; si->kflags |= DN_ACTIVE; heap_insert(&dn_cfg.evheap, now + t, si); } if (delay_line_idle && done) transmit_event(q, &si->dline, now); return q->head; } /* * The timer handler for dummynet. Time is computed in ticks, but * but the code is tolerant to the actual rate at which this is called. * Once complete, the function reschedules itself for the next tick. */ void dummynet_task(void *context, int pending) { struct timeval t; struct mq q = { NULL, NULL }; /* queue to accumulate results */ CURVNET_SET((struct vnet *)context); DN_BH_WLOCK(); /* Update number of lost(coalesced) ticks. */ tick_lost += pending - 1; getmicrouptime(&t); /* Last tick duration (usec). */ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + (t.tv_usec - dn_cfg.prev_t.tv_usec); /* Last tick vs standard tick difference (usec). */ tick_delta = (tick_last * hz - 1000000) / hz; /* Accumulated tick difference (usec). */ tick_delta_sum += tick_delta; dn_cfg.prev_t = t; /* * Adjust curr_time if the accumulated tick difference is * greater than the 'standard' tick. Since curr_time should * be monotonically increasing, we do positive adjustments * as required, and throttle curr_time in case of negative * adjustment. */ dn_cfg.curr_time++; if (tick_delta_sum - tick >= 0) { int diff = tick_delta_sum / tick; dn_cfg.curr_time += diff; tick_diff += diff; tick_delta_sum %= tick; tick_adjustment++; } else if (tick_delta_sum + tick <= 0) { dn_cfg.curr_time--; tick_diff--; tick_delta_sum += tick; tick_adjustment++; } /* serve pending events, accumulate in q */ for (;;) { struct dn_id *p; /* generic parameter to handler */ if (dn_cfg.evheap.elements == 0 || DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) break; p = HEAP_TOP(&dn_cfg.evheap)->object; heap_extract(&dn_cfg.evheap, NULL); if (p->type == DN_SCH_I) { serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); } else { /* extracted a delay line */ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); } } if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { dn_cfg.expire_cycle = 0; dn_drain_scheduler(); dn_drain_queue(); } dn_reschedule(); DN_BH_WUNLOCK(); if (q.head != NULL) dummynet_send(q.head); CURVNET_RESTORE(); } /* * forward a chain of packets to the proper destination. * This runs outside the dummynet lock. */ static void dummynet_send(struct mbuf *m) { struct mbuf *n; + NET_EPOCH_ASSERT(); + for (; m != NULL; m = n) { struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ struct m_tag *tag; int dst; n = m->m_nextpkt; m->m_nextpkt = NULL; tag = m_tag_first(m); if (tag == NULL) { /* should not happen */ dst = DIR_DROP; } else { struct dn_pkt_tag *pkt = dn_tag_get(m); /* extract the dummynet info, rename the tag * to carry reinject info. */ if (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2) && pkt->ifp == NULL) { dst = DIR_DROP; } else { dst = pkt->dn_dir; ifp = pkt->ifp; tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; } } switch (dst) { case DIR_OUT: ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); break ; case DIR_IN : netisr_dispatch(NETISR_IP, m); break; #ifdef INET6 case DIR_IN | PROTO_IPV6: netisr_dispatch(NETISR_IPV6, m); break; case DIR_OUT | PROTO_IPV6: ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); break; #endif case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. * Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup failed, " "dropping packet\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); break; case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */ ether_output_frame(ifp, m); break; case DIR_DROP: /* drop the packet after some time */ FREE_PKT(m); break; default: printf("dummynet: bad switch %d!\n", dst); FREE_PKT(m); break; } } } static inline int tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct dn_pkt_tag *dt; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*dt), M_NOWAIT | M_ZERO); if (mtag == NULL) return 1; /* Cannot allocate packet header. */ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ dt = (struct dn_pkt_tag *)(mtag + 1); dt->rule = fwa->rule; dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ dt->dn_dir = dir; dt->ifp = fwa->flags & IPFW_ARGS_OUT ? fwa->ifp : NULL; /* dt->output tame is updated as we move through */ dt->output_time = dn_cfg.curr_time; dt->iphdr_off = (dir & PROTO_LAYER2) ? ETHER_HDR_LEN : 0; return 0; } /* * dummynet hook for packets. * We use the argument to locate the flowset fs and the sched_set sch * associated to it. The we apply flow_mask and sched_mask to * determine the queue and scheduler instances. */ int dummynet_io(struct mbuf **m0, struct ip_fw_args *fwa) { struct mbuf *m = *m0; struct dn_fsk *fs = NULL; struct dn_sch_inst *si; struct dn_queue *q = NULL; /* default */ int fs_id, dir; fs_id = (fwa->rule.info & IPFW_INFO_MASK) + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); /* XXXGL: convert args to dir */ if (fwa->flags & IPFW_ARGS_IN) dir = DIR_IN; else dir = DIR_OUT; if (fwa->flags & IPFW_ARGS_ETHER) dir |= PROTO_LAYER2; else if (fwa->flags & IPFW_ARGS_IP6) dir |= PROTO_IPV6; DN_BH_WLOCK(); io_pkt++; /* we could actually tag outside the lock, but who cares... */ if (tag_mbuf(m, dir, fwa)) goto dropit; if (dn_cfg.busy) { /* if the upper half is busy doing something expensive, * lets queue the packet and move forward */ mq_append(&dn_cfg.pending, m); m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* XXX locate_flowset could be optimised with a direct ref. */ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); if (fs == NULL) goto dropit; /* This queue/pipe does not exist! */ if (fs->sched == NULL) /* should not happen */ goto dropit; /* find scheduler instance, possibly applying sched_mask */ si = ipdn_si_find(fs->sched, &(fwa->f_id)); if (si == NULL) goto dropit; /* * If the scheduler supports multiple queues, find the right one * (otherwise it will be ignored by enqueue). */ if (fs->sched->fp->flags & DN_MULTIQUEUE) { q = ipdn_q_find(fs, si, &(fwa->f_id)); if (q == NULL) goto dropit; } if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; /* dn_enqueue already increases io_pkt_drop */ io_pkt_drop--; goto dropit; } if (si->kflags & DN_ACTIVE) { m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* compute the initial allowance */ if (si->idle_time < dn_cfg.curr_time) { /* Do this only on the first packet on an idle pipe */ struct dn_link *p = &fs->sched->link; si->sched_time = dn_cfg.curr_time; si->credit = dn_cfg.io_fast ? p->bandwidth : 0; if (p->burst) { uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; if (burst > p->burst) burst = p->burst; si->credit += burst; } } /* pass through scheduler and delay line */ m = serve_sched(NULL, si, dn_cfg.curr_time); /* optimization -- pass it back to ipfw for immediate send */ /* XXX Don't call dummynet_send() if scheduler return the packet * just enqueued. This avoid a lock order reversal. * */ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { /* fast io, rename the tag * to carry reinject info. */ struct m_tag *tag = m_tag_first(m); tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; io_pkt_fast++; if (m->m_nextpkt != NULL) { printf("dummynet: fast io: pkt chain detected!\n"); m->m_nextpkt = NULL; } m = NULL; } else { *m0 = NULL; } done: DN_BH_WUNLOCK(); if (m) dummynet_send(m); return 0; dropit: io_pkt_drop++; DN_BH_WUNLOCK(); if (m) FREE_PKT(m); *m0 = NULL; return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; } diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c index f187db2a224c..62653c877122 100644 --- a/sys/netpfil/pf/pf_if.c +++ b/sys/netpfil/pf/pf_if.c @@ -1,944 +1,975 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2003 Cedric Berger * Copyright (c) 2005 Henning Brauer * Copyright (c) 2005 Ryan McBride * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(struct pfi_kif *, pfi_all); VNET_DEFINE_STATIC(long, pfi_update); #define V_pfi_update VNET(pfi_update) #define PFI_BUFFER_MAX 0x10000 VNET_DECLARE(int, pf_vnet_active); #define V_pf_vnet_active VNET(pf_vnet_active) VNET_DEFINE_STATIC(struct pfr_addr *, pfi_buffer); VNET_DEFINE_STATIC(int, pfi_buffer_cnt); VNET_DEFINE_STATIC(int, pfi_buffer_max); #define V_pfi_buffer VNET(pfi_buffer) #define V_pfi_buffer_cnt VNET(pfi_buffer_cnt) #define V_pfi_buffer_max VNET(pfi_buffer_max) eventhandler_tag pfi_attach_cookie; eventhandler_tag pfi_detach_cookie; eventhandler_tag pfi_attach_group_cookie; eventhandler_tag pfi_change_group_cookie; eventhandler_tag pfi_detach_group_cookie; eventhandler_tag pfi_ifaddr_event_cookie; -static void pfi_attach_ifnet(struct ifnet *); -static void pfi_attach_ifgroup(struct ifg_group *); +static void pfi_attach_ifnet(struct ifnet *, struct pfi_kif *); +static void pfi_attach_ifgroup(struct ifg_group *, struct pfi_kif *); static void pfi_kif_update(struct pfi_kif *); static void pfi_dynaddr_update(struct pfi_dynaddr *dyn); static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int, int); static void pfi_instance_add(struct ifnet *, int, int); static void pfi_address_add(struct sockaddr *, int, int); static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *); static int pfi_skip_if(const char *, struct pfi_kif *); static int pfi_unmask(void *); static void pfi_attach_ifnet_event(void * __unused, struct ifnet *); static void pfi_detach_ifnet_event(void * __unused, struct ifnet *); static void pfi_attach_group_event(void * __unused, struct ifg_group *); static void pfi_change_group_event(void * __unused, char *); static void pfi_detach_group_event(void * __unused, struct ifg_group *); static void pfi_ifaddr_event(void * __unused, struct ifnet *); RB_HEAD(pfi_ifhead, pfi_kif); static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); VNET_DEFINE_STATIC(struct pfi_ifhead, pfi_ifs); #define V_pfi_ifs VNET(pfi_ifs) #define PFI_BUFFER_MAX 0x10000 MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database"); LIST_HEAD(pfi_list, pfi_kif); VNET_DEFINE_STATIC(struct pfi_list, pfi_unlinked_kifs); #define V_pfi_unlinked_kifs VNET(pfi_unlinked_kifs) static struct mtx pfi_unlnkdkifs_mtx; MTX_SYSINIT(pfi_unlnkdkifs_mtx, &pfi_unlnkdkifs_mtx, "pf unlinked interfaces", MTX_DEF); void pfi_initialize_vnet(void) { + struct pfi_list kifs = LIST_HEAD_INITIALIZER(); + struct epoch_tracker et; + struct pfi_kif *kif; struct ifg_group *ifg; struct ifnet *ifp; - struct pfi_kif *kif; + int nkifs; V_pfi_buffer_max = 64; V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer), PFI_MTYPE, M_WAITOK); - kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); - PF_RULES_WLOCK(); - V_pfi_all = pfi_kif_attach(kif, IFG_ALL); - PF_RULES_WUNLOCK(); - + nkifs = 1; /* one for V_pfi_all */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) - pfi_attach_ifgroup(ifg); + nkifs++; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) - pfi_attach_ifnet(ifp); + nkifs++; + + for (int n = 0; n < nkifs; n++) { + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + LIST_INSERT_HEAD(&kifs, kif, pfik_list); + } + + NET_EPOCH_ENTER(et); + PF_RULES_WLOCK(); + kif = LIST_FIRST(&kifs); + LIST_REMOVE(kif, pfik_list); + V_pfi_all = pfi_kif_attach(kif, IFG_ALL); + CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) { + kif = LIST_FIRST(&kifs); + LIST_REMOVE(kif, pfik_list); + pfi_attach_ifgroup(ifg, kif); + } + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + kif = LIST_FIRST(&kifs); + LIST_REMOVE(kif, pfik_list); + pfi_attach_ifnet(ifp, kif); + } + PF_RULES_WUNLOCK(); + NET_EPOCH_EXIT(et); IFNET_RUNLOCK(); + + MPASS(LIST_EMPTY(&kifs)); } void pfi_initialize(void) { pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event, pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event, pfi_attach_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event, pfi_change_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event, pfi_detach_group_event, NULL, EVENTHANDLER_PRI_ANY); pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event, pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); } void pfi_cleanup_vnet(void) { struct pfi_kif *kif; PF_RULES_WASSERT(); V_pfi_all = NULL; while ((kif = RB_MIN(pfi_ifhead, &V_pfi_ifs))) { RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); if (kif->pfik_group) kif->pfik_group->ifg_pf_kif = NULL; if (kif->pfik_ifp) { if_rele(kif->pfik_ifp); kif->pfik_ifp->if_pf_kif = NULL; } free(kif, PFI_MTYPE); } mtx_lock(&pfi_unlnkdkifs_mtx); while ((kif = LIST_FIRST(&V_pfi_unlinked_kifs))) { LIST_REMOVE(kif, pfik_list); free(kif, PFI_MTYPE); } mtx_unlock(&pfi_unlnkdkifs_mtx); free(V_pfi_buffer, PFI_MTYPE); } void pfi_cleanup(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie); EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie); EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie); EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie); EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie); EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie); } struct pfi_kif * pfi_kif_find(const char *kif_name) { struct pfi_kif_cmp s; PF_RULES_ASSERT(); bzero(&s, sizeof(s)); strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name)); return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s)); } struct pfi_kif * pfi_kif_attach(struct pfi_kif *kif, const char *kif_name) { struct pfi_kif *kif1; PF_RULES_WASSERT(); KASSERT(kif != NULL, ("%s: null kif", __func__)); kif1 = pfi_kif_find(kif_name); if (kif1 != NULL) { free(kif, PFI_MTYPE); return (kif1); } bzero(kif, sizeof(*kif)); strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name)); /* * It seems that the value of time_second is in unintialzied state * when pf sets interface statistics clear time in boot phase if pf * was statically linked to kernel. Instead of setting the bogus * time value have pfi_get_ifaces handle this case. In * pfi_get_ifaces it uses time_second if it sees the time is 0. */ kif->pfik_tzero = time_second > 1 ? time_second : 0; TAILQ_INIT(&kif->pfik_dynaddrs); RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif); return (kif); } void pfi_kif_ref(struct pfi_kif *kif) { PF_RULES_WASSERT(); kif->pfik_rulerefs++; } void pfi_kif_unref(struct pfi_kif *kif) { PF_RULES_WASSERT(); KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif)); kif->pfik_rulerefs--; if (kif->pfik_rulerefs > 0) return; /* kif referencing an existing ifnet or group should exist. */ if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all) return; RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); kif->pfik_flags |= PFI_IFLAG_REFS; mtx_lock(&pfi_unlnkdkifs_mtx); LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list); mtx_unlock(&pfi_unlnkdkifs_mtx); } void pfi_kif_purge(void) { struct pfi_kif *kif, *kif1; /* * Do naive mark-and-sweep garbage collecting of old kifs. * Reference flag is raised by pf_purge_expired_states(). */ mtx_lock(&pfi_unlnkdkifs_mtx); LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) { if (!(kif->pfik_flags & PFI_IFLAG_REFS)) { LIST_REMOVE(kif, pfik_list); free(kif, PFI_MTYPE); } else kif->pfik_flags &= ~PFI_IFLAG_REFS; } mtx_unlock(&pfi_unlnkdkifs_mtx); } int pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) { struct ifg_list *p; + NET_EPOCH_ASSERT(); + if (rule_kif == NULL || rule_kif == packet_kif) return (1); if (rule_kif->pfik_group != NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) - if (p->ifgl_group == rule_kif->pfik_group) { - NET_EPOCH_EXIT(et); + if (p->ifgl_group == rule_kif->pfik_group) return (1); - } - NET_EPOCH_EXIT(et); } - return (0); } static void -pfi_attach_ifnet(struct ifnet *ifp) +pfi_attach_ifnet(struct ifnet *ifp, struct pfi_kif *kif) { - struct pfi_kif *kif; - kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WASSERT(); - PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, ifp->if_xname); - if_ref(ifp); - kif->pfik_ifp = ifp; ifp->if_pf_kif = kif; - pfi_kif_update(kif); - PF_RULES_WUNLOCK(); } static void -pfi_attach_ifgroup(struct ifg_group *ifg) +pfi_attach_ifgroup(struct ifg_group *ifg, struct pfi_kif *kif) { - struct pfi_kif *kif; - kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WASSERT(); - PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, ifg->ifg_group); - kif->pfik_group = ifg; ifg->ifg_pf_kif = kif; - PF_RULES_WUNLOCK(); } int pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: switch (dyn->pfid_acnt4) { case 0: return (0); case 1: return (PF_MATCHA(0, &dyn->pfid_addr4, &dyn->pfid_mask4, a, AF_INET)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: switch (dyn->pfid_acnt6) { case 0: return (0); case 1: return (PF_MATCHA(0, &dyn->pfid_addr6, &dyn->pfid_mask6, a, AF_INET6)); default: return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); } break; #endif /* INET6 */ default: return (0); } } int pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af) { + struct epoch_tracker et; struct pfi_dynaddr *dyn; char tblname[PF_TABLE_NAME_SIZE]; struct pf_ruleset *ruleset = NULL; struct pfi_kif *kif; int rv = 0; PF_RULES_WASSERT(); KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", __func__, aw->type)); KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn)); if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL) return (ENOMEM); if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) { free(dyn, PFI_MTYPE); return (ENOMEM); } if (!strcmp(aw->v.ifname, "self")) dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL); else dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname); pfi_kif_ref(dyn->pfid_kif); dyn->pfid_net = pfi_unmask(&aw->v.a.mask); if (af == AF_INET && dyn->pfid_net == 32) dyn->pfid_net = 128; strlcpy(tblname, aw->v.ifname, sizeof(tblname)); if (aw->iflags & PFI_AFLAG_NETWORK) strlcat(tblname, ":network", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_BROADCAST) strlcat(tblname, ":broadcast", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_PEER) strlcat(tblname, ":peer", sizeof(tblname)); if (aw->iflags & PFI_AFLAG_NOALIAS) strlcat(tblname, ":0", sizeof(tblname)); if (dyn->pfid_net != 128) snprintf(tblname + strlen(tblname), sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net); if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) { rv = ENOMEM; goto _bad; } if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) { rv = ENOMEM; goto _bad; } dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE; dyn->pfid_iflags = aw->iflags; dyn->pfid_af = af; TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); aw->p.dyn = dyn; + NET_EPOCH_ENTER(et); pfi_kif_update(dyn->pfid_kif); + NET_EPOCH_EXIT(et); return (0); _bad: if (dyn->pfid_kt != NULL) pfr_detach_table(dyn->pfid_kt); if (ruleset != NULL) pf_remove_if_empty_ruleset(ruleset); if (dyn->pfid_kif != NULL) pfi_kif_unref(dyn->pfid_kif); free(dyn, PFI_MTYPE); return (rv); } static void pfi_kif_update(struct pfi_kif *kif) { struct ifg_list *ifgl; struct pfi_dynaddr *p; + NET_EPOCH_ASSERT(); PF_RULES_WASSERT(); /* update all dynaddr */ TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry) pfi_dynaddr_update(p); /* again for all groups kif is member of */ if (kif->pfik_ifp != NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) pfi_kif_update((struct pfi_kif *) ifgl->ifgl_group->ifg_pf_kif); - NET_EPOCH_EXIT(et); } } static void pfi_dynaddr_update(struct pfi_dynaddr *dyn) { struct pfi_kif *kif; struct pfr_ktable *kt; PF_RULES_WASSERT(); KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt, ("%s: bad argument", __func__)); kif = dyn->pfid_kif; kt = dyn->pfid_kt; if (kt->pfrkt_larg != V_pfi_update) { /* this table needs to be brought up-to-date */ pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags); kt->pfrkt_larg = V_pfi_update; } pfr_dynaddr_update(kt, dyn); } static void pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) { int e, size2 = 0; struct ifg_member *ifgm; + NET_EPOCH_ASSERT(); + V_pfi_buffer_cnt = 0; if (kif->pfik_ifp != NULL) pfi_instance_add(kif->pfik_ifp, net, flags); else if (kif->pfik_group != NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) pfi_instance_add(ifgm->ifgm_ifp, net, flags); - NET_EPOCH_EXIT(et); } if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2, NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK))) printf("%s: cannot set %d new addresses into table %s: %d\n", __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e); } static void pfi_instance_add(struct ifnet *ifp, int net, int flags) { - struct epoch_tracker et; struct ifaddr *ia; int got4 = 0, got6 = 0; int net2, af; - NET_EPOCH_ENTER(et); + NET_EPOCH_ASSERT(); + CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; af = ia->ifa_addr->sa_family; if (af != AF_INET && af != AF_INET6) continue; /* * XXX: For point-to-point interfaces, (ifname:0) and IPv4, * jump over addresses without a proper route to work * around a problem with ppp not fully removing the * address used during IPCP. */ if ((ifp->if_flags & IFF_POINTOPOINT) && !(ia->ifa_flags & IFA_ROUTE) && (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET)) continue; if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) continue; if ((flags & PFI_AFLAG_BROADCAST) && !(ifp->if_flags & IFF_BROADCAST)) continue; if ((flags & PFI_AFLAG_PEER) && !(ifp->if_flags & IFF_POINTOPOINT)) continue; if ((flags & (PFI_AFLAG_NETWORK | PFI_AFLAG_NOALIAS)) && af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL( &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) continue; if (flags & PFI_AFLAG_NOALIAS) { if (af == AF_INET && got4) continue; if (af == AF_INET6 && got6) continue; } if (af == AF_INET) got4 = 1; else if (af == AF_INET6) got6 = 1; net2 = net; if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { if (af == AF_INET) net2 = pfi_unmask(&((struct sockaddr_in *) ia->ifa_netmask)->sin_addr); else if (af == AF_INET6) net2 = pfi_unmask(&((struct sockaddr_in6 *) ia->ifa_netmask)->sin6_addr); } if (af == AF_INET && net2 > 32) net2 = 32; if (flags & PFI_AFLAG_BROADCAST) pfi_address_add(ia->ifa_broadaddr, af, net2); else if (flags & PFI_AFLAG_PEER) pfi_address_add(ia->ifa_dstaddr, af, net2); else pfi_address_add(ia->ifa_addr, af, net2); } - NET_EPOCH_EXIT(et); } static void pfi_address_add(struct sockaddr *sa, int af, int net) { struct pfr_addr *p; int i; if (V_pfi_buffer_cnt >= V_pfi_buffer_max) { int new_max = V_pfi_buffer_max * 2; if (new_max > PFI_BUFFER_MAX) { printf("%s: address buffer full (%d/%d)\n", __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); return; } p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE, M_NOWAIT); if (p == NULL) { printf("%s: no memory to grow buffer (%d/%d)\n", __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); return; } memcpy(p, V_pfi_buffer, V_pfi_buffer_max * sizeof(*V_pfi_buffer)); /* no need to zero buffer */ free(V_pfi_buffer, PFI_MTYPE); V_pfi_buffer = p; V_pfi_buffer_max = new_max; } if (af == AF_INET && net > 32) net = 128; p = V_pfi_buffer + V_pfi_buffer_cnt++; bzero(p, sizeof(*p)); p->pfra_af = af; p->pfra_net = net; if (af == AF_INET) p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; else if (af == AF_INET6) { p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) p->pfra_ip6addr.s6_addr16[1] = 0; } /* mask network address bits */ if (net < 128) ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8)); for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++) ((caddr_t)p)[i] = 0; } void pfi_dynaddr_remove(struct pfi_dynaddr *dyn) { KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__)); KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__)); TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); pfi_kif_unref(dyn->pfid_kif); pfr_detach_table(dyn->pfid_kt); free(dyn, PFI_MTYPE); } void pfi_dynaddr_copyout(struct pf_addr_wrap *aw) { KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", __func__, aw->type)); if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL) return; aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6; } static int pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q) { return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ)); } void pfi_update_status(const char *name, struct pf_status *pfs) { struct pfi_kif *p; struct pfi_kif_cmp key; struct ifg_member p_member, *ifgm; CK_STAILQ_HEAD(, ifg_member) ifg_members; int i, j, k; strlcpy(key.pfik_name, name, sizeof(key.pfik_name)); p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key); if (p == NULL) return; if (p->pfik_group != NULL) { bcopy(&p->pfik_group->ifg_members, &ifg_members, sizeof(ifg_members)); } else { /* build a temporary list for p only */ bzero(&p_member, sizeof(p_member)); p_member.ifgm_ifp = p->pfik_ifp; CK_STAILQ_INIT(&ifg_members); CK_STAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next); } if (pfs) { bzero(pfs->pcounters, sizeof(pfs->pcounters)); bzero(pfs->bcounters, sizeof(pfs->bcounters)); } CK_STAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == NULL || ifgm->ifgm_ifp->if_pf_kif == NULL) continue; p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif; /* just clear statistics */ if (pfs == NULL) { bzero(p->pfik_packets, sizeof(p->pfik_packets)); bzero(p->pfik_bytes, sizeof(p->pfik_bytes)); p->pfik_tzero = time_second; continue; } for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) for (k = 0; k < 2; k++) { pfs->pcounters[i][j][k] += p->pfik_packets[i][j][k]; pfs->bcounters[i][j] += p->pfik_bytes[i][j][k]; } } } void pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size) { + struct epoch_tracker et; struct pfi_kif *p, *nextp; int n = 0; + NET_EPOCH_ENTER(et); for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) { nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); if (pfi_skip_if(name, p)) continue; if (*size <= n++) break; if (!p->pfik_tzero) p->pfik_tzero = time_second; bcopy(p, buf++, sizeof(*buf)); nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); } *size = n; + NET_EPOCH_EXIT(et); } static int pfi_skip_if(const char *filter, struct pfi_kif *p) { struct ifg_list *i; int n; + NET_EPOCH_ASSERT(); + if (filter == NULL || !*filter) return (0); if (!strcmp(p->pfik_name, filter)) return (0); /* exact match */ n = strlen(filter); if (n < 1 || n >= IFNAMSIZ) return (1); /* sanity check */ if (filter[n-1] >= '0' && filter[n-1] <= '9') return (1); /* group names may not end in a digit */ - if (p->pfik_ifp != NULL) { - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); - CK_STAILQ_FOREACH(i, &p->pfik_ifp->if_groups, ifgl_next) { - if (!strncmp(i->ifgl_group->ifg_group, filter, - IFNAMSIZ)) { - NET_EPOCH_EXIT(et); - return (0); /* iface is in group "filter" */ - } - } - NET_EPOCH_EXIT(et); - } + if (p->pfik_ifp == NULL) + return (1); + CK_STAILQ_FOREACH(i, &p->pfik_ifp->if_groups, ifgl_next) + if (!strncmp(i->ifgl_group->ifg_group, filter, IFNAMSIZ)) + return (0); /* iface is in group "filter" */ return (1); } int pfi_set_flags(const char *name, int flags) { + struct epoch_tracker et; struct pfi_kif *p; + NET_EPOCH_ENTER(et); RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { if (pfi_skip_if(name, p)) continue; p->pfik_flags |= flags; } + NET_EPOCH_EXIT(et); return (0); } int pfi_clear_flags(const char *name, int flags) { + struct epoch_tracker et; struct pfi_kif *p; + NET_EPOCH_ENTER(et); RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { if (pfi_skip_if(name, p)) continue; p->pfik_flags &= ~flags; } + NET_EPOCH_EXIT(et); return (0); } /* from pf_print_state.c */ static int pfi_unmask(void *addr) { struct pf_addr *m = addr; int i = 31, j = 0, b = 0; u_int32_t tmp; while (j < 4 && m->addr32[j] == 0xffffffff) { b += 32; j++; } if (j < 4) { tmp = ntohl(m->addr32[j]); for (i = 31; tmp & (1 << i); --i) b++; } return (b); } static void pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp) { + struct epoch_tracker et; + struct pfi_kif *kif; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } - pfi_attach_ifnet(ifp); -#ifdef ALTQ + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + NET_EPOCH_ENTER(et); PF_RULES_WLOCK(); + pfi_attach_ifnet(ifp, kif); +#ifdef ALTQ pf_altq_ifnet_event(ifp, 0); - PF_RULES_WUNLOCK(); #endif + PF_RULES_WUNLOCK(); + NET_EPOCH_EXIT(et); } static void pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp) { + struct epoch_tracker et; struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif; if (pfsync_detach_ifnet_ptr) pfsync_detach_ifnet_ptr(ifp); if (kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } + NET_EPOCH_ENTER(et); PF_RULES_WLOCK(); V_pfi_update++; pfi_kif_update(kif); if (kif->pfik_ifp) if_rele(kif->pfik_ifp); kif->pfik_ifp = NULL; ifp->if_pf_kif = NULL; #ifdef ALTQ pf_altq_ifnet_event(ifp, 1); #endif PF_RULES_WUNLOCK(); + NET_EPOCH_EXIT(et); } static void pfi_attach_group_event(void *arg __unused, struct ifg_group *ifg) { + struct epoch_tracker et; + struct pfi_kif *kif; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } - pfi_attach_ifgroup(ifg); + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + NET_EPOCH_ENTER(et); + PF_RULES_WLOCK(); + pfi_attach_ifgroup(ifg, kif); + PF_RULES_WUNLOCK(); + NET_EPOCH_EXIT(et); } static void pfi_change_group_event(void *arg __unused, char *gname) { + struct epoch_tracker et; struct pfi_kif *kif; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + NET_EPOCH_ENTER(et); PF_RULES_WLOCK(); V_pfi_update++; kif = pfi_kif_attach(kif, gname); pfi_kif_update(kif); PF_RULES_WUNLOCK(); + NET_EPOCH_EXIT(et); } static void pfi_detach_group_event(void *arg __unused, struct ifg_group *ifg) { struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif; if (kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } PF_RULES_WLOCK(); V_pfi_update++; kif->pfik_group = NULL; ifg->ifg_pf_kif = NULL; PF_RULES_WUNLOCK(); } static void pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp) { KASSERT(ifp, ("ifp == NULL")); if (ifp->if_pf_kif == NULL) return; if (V_pf_vnet_active == 0) { /* Avoid teardown race in the least expensive way. */ return; } PF_RULES_WLOCK(); if (ifp->if_pf_kif) { + struct epoch_tracker et; + V_pfi_update++; + NET_EPOCH_ENTER(et); pfi_kif_update(ifp->if_pf_kif); + NET_EPOCH_EXIT(et); } PF_RULES_WUNLOCK(); } diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c index bb1efcc666a1..e411000f84b3 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1,1759 +1,1762 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" #include static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) static struct unrhdr *ipoib_unrhdr; static void ipoib_unrhdr_init(void *arg) { ipoib_unrhdr = new_unrhdr(0, 65535, NULL); } SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL); static void ipoib_unrhdr_uninit(void *arg) { if (ipoib_unrhdr != NULL) { struct unrhdr *hdr; hdr = ipoib_unrhdr; ipoib_unrhdr = NULL; delete_unrhdr(hdr); } } SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); /* * This is for clients that have an ipoib_header in the mbuf. */ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } static int ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate) { struct ifnet *ifp; struct ifreq ifr; int error; ifp = priv->dev; if (ifp->if_mtu == new_mtu) return (0); if (propagate) { strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); ifr.ifr_mtu = new_mtu; CURVNET_SET(ifp->if_vnet); error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread); CURVNET_RESTORE(); } else { ifp->if_mtu = new_mtu; error = 0; } return (error); } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate) { int error, prev_admin_mtu; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate)); } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; prev_admin_mtu = priv->admin_mtu; priv->admin_mtu = new_mtu; error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu), propagate); if (error == 0) { /* check for MTU change to avoid infinite loop */ if (prev_admin_mtu != new_mtu) queue_work(ipoib_workqueue, &priv->flush_light); } else priv->admin_mtu = prev_admin_mtu; return (error); } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], INFINIBAND_ALEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) _IF_ENQUEUE(&path->queue, mb); else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } void ipoib_start_locked(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; assert_spin_locked(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); ipoib_start_locked(dev, priv); spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; bpfdetach(dev); if_detach(dev); if_free(dev); free_unr(ipoib_unrhdr, priv->unit); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return NULL; } dev->if_softc = priv; priv->unit = alloc_unr(ipoib_unrhdr); if (priv->unit == -1) { if_free(dev); free(priv, M_TEMP); return NULL; } if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; if_attach(dev); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = dev->if_addrlen; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; } int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { struct ib_device_attr *device_attr = &hca->attrs; priv->hca_caps = device_attr->device_cap_flags; priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list = client_data; if (!dev_list) return; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) continue; ipoib_stop(priv); ib_unregister_event_handler(&priv->event_handler); /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); ipoib_dev_cleanup(priv); ipoib_detach(priv); } kfree(dev_list); } static int ipoib_match_dev_addr(const struct sockaddr *addr, struct net_device *dev) { struct epoch_tracker et; struct ifaddr *ifa; int retval = 0; CURVNET_SET(dev->if_vnet); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_addr->sa_len != addr->sa_len) { continue; } if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0) { retval = 1; break; } } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (retval); } /* * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on * top a given ipoib device matching a pkey_index and address, if one * exists. * * @found_net_dev: contains a matching net_device if the return value * >= 1, with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, struct net_device **found_net_dev) { struct ipoib_dev_priv *child_priv; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) { if (*found_net_dev == NULL) { struct net_device *net_dev; if (priv->parent != NULL) net_dev = priv->parent; else net_dev = priv->dev; *found_net_dev = net_dev; dev_hold(net_dev); } matches++; } } /* Check child interfaces */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, found_net_dev); if (matches > 1) break; } mutex_unlock(&priv->vlan_mutex); return matches; } /* * __ipoib_get_net_dev_by_params - returns the number of matching * net_devs found (between 0 and 2). Also return the matching * net_device in the @net_dev parameter, holding a reference to the * net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, net_dev); if (matches > 1) break; } return matches; } static struct net_device * ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; if (!dev_list) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); /* Fall through */ case 1: return net_dev; } } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. */ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. */ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } /* * Infiniband output routine. */ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; #if defined(INET) || defined(INET6) struct llentry *lle = NULL; #endif struct ipoib_header *eh; int error = 0, is_gw = 0; short type; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { + struct epoch_tracker et; int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } + NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); + NET_EPOCH_EXIT(et); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);