Index: head/sys/net/if_stf.c =================================================================== --- head/sys/net/if_stf.c (revision 362899) +++ head/sys/net/if_stf.c (revision 362900) @@ -1,759 +1,762 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; static int in_stf_input(struct mbuf *, int, int, void *); static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_IPV6, .min_length = sizeof(struct ip), .exact_match = (sizeof(in_addr_t) << 3) + 8, .check = stf_encapcheck, .input = in_stf_input }; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. */ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err __unused; err = ip_encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct rm_priotracker in_ifa_tracker; struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; return (0); } return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if (isrfc1918addr(in)) return -1; /* * reject packets with broadcast */ IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { - struct nhop4_basic nh4; + struct nhop_object *nh; - if (fib4_lookup_nh_basic(sc->sc_fibnum, *in, 0, 0, &nh4) != 0) + NET_EPOCH_ASSERT(); + nh = fib4_lookup(sc->sc_fibnum, *in, 0, 0, 0); + if (nh == NULL) return (-1); - if (nh4.nh_ifp != inifp) + if (nh->nh_ifp != inifp) return (-1); } return 0; } static int stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; bcopy(GET_V4(in6), &in4, sizeof(in4)); return stf_checkaddr4(sc, &in4, inifp); } /* * reject anything that look suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. */ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6)) return -1; return 0; } static int in_stf_input(struct mbuf *m, int off, int proto, void *arg) { struct stf_softc *sc = arg; struct ip *ip; struct ip6_hdr *ip6; u_int8_t otos, itos; struct ifnet *ifp; NET_EPOCH_ASSERT(); if (proto != IPPROTO_IPV6) { m_freem(m); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } Index: head/sys/netinet/if_ether.c =================================================================== --- head/sys/netinet/if_ether.c (revision 362899) +++ head/sys/netinet/if_ether.c (revision 362900) @@ -1,1521 +1,1523 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #ifdef INET #include #endif #include #define SIN(s) ((const struct sockaddr_in *)(s)) static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; /* Simple ARP state machine */ enum arp_llinfo_state { ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */ ARP_LLINFO_REACHABLE, /* LLE is valid */ ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */ ARP_LLINFO_DELETED, /* LLE is deleted */ }; SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); /* timer values */ VNET_DEFINE_STATIC(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ VNET_DEFINE_STATIC(int, arp_maxtries) = 5; VNET_DEFINE_STATIC(int, arp_proxyall) = 0; VNET_DEFINE_STATIC(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ VNET_DEFINE_STATIC(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arpt_rexmit VNET(arpt_rexmit) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); /* * Due to the exponential backoff algorithm used for the interval between GARP * retransmissions, the maximum number of retransmissions is limited for * sanity. This limit corresponds to a maximum interval between retransmissions * of 2^16 seconds ~= 18 hours. * * Making this limit more dynamic is more complicated than worthwhile, * especially since sending out GARPs spaced days apart would be of little * use. A maximum dynamic limit would look something like: * * const int max = fls(INT_MAX / hz) - 1; */ #define MAX_GARP_RETRANSMITS 16 static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS); static int garp_rexmit_count = 0; /* GARP retransmission setting. */ SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &garp_rexmit_count, 0, sysctl_garp_rexmit, "I", "Number of times to retransmit GARP packets;" " 0 to disable, maximum of 16"); VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO; /* Min. log(9) level. */ #define V_arp_log_level VNET(arp_log_level) SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_log_level), 0, "Minimum log(9) level for recording rate limited arp log messages. " "The higher will be log more (emerg=0, info=6 (default), debug=7)."); #define ARP_LOG(pri, ...) do { \ if ((pri) <= V_arp_log_level && \ ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la); static void arp_mark_lle_reachable(struct llentry *la); static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); static eventhandler_tag iflladdr_tag; static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; int r_skip_req; if (lle->la_flags & LLE_STATIC) { return; } LLE_WLOCK(lle); if (callout_pending(&lle->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by arpresolve() below. */ LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); switch (lle->ln_state) { case ARP_LLINFO_REACHABLE: /* * Expiration time is approaching. * Let's try to refresh entry if it is still * in use. * * Set r_skip_req to get feedback from * fast path. Change state and re-schedule * ourselves. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); lle->ln_state = ARP_LLINFO_VERIFY; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; case ARP_LLINFO_VERIFY: LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; LLE_REQ_UNLOCK(lle); if (r_skip_req == 0 && lle->la_preempt > 0) { /* Entry was used, issue refresh request */ struct epoch_tracker et; struct in_addr dst; dst = lle->r_l3addr.addr4; lle->la_preempt--; callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); NET_EPOCH_ENTER(et); arprequest(ifp, NULL, &dst, NULL); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return; } /* Nothing happened. Reschedule if not too late */ if (lle->la_expire > time_uptime) { callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); LLE_WUNLOCK(lle); CURVNET_RESTORE(); return; } break; case ARP_LLINFO_INCOMPLETE: case ARP_LLINFO_DELETED: break; } if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->lle_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { LLE_REMREF(lle); lltable_unlink_entry(lle->lle_tbl, lle); } IF_AFDATA_UNLOCK(ifp); size_t pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Stores link-layer header for @ifp in format suitable for if_output() * into buffer @buf. Resulting header length is stored in @bufsize. * * Returns 0 on success. */ static int arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, size_t *bufsize) { struct if_encap_req ereq; int error; bzero(buf, *bufsize); bzero(&ereq, sizeof(ereq)); ereq.buf = buf; ereq.bufsize = *bufsize; ereq.rtype = IFENCAP_LL; ereq.family = AF_ARP; ereq.lladdr = ar_tha(ah); ereq.hdata = (u_char *)ah; if (bcast) ereq.flags = IFENCAP_FLAG_BROADCAST; error = ifp->if_requestencap(ifp, &ereq); if (error == 0) *bufsize = ereq.bufsize; return (error); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ static int arprequest_internal(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; struct route ro; int error; NET_EPOCH_ASSERT(); if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return (EADDRNOTAVAIL); } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return (ENOMEM); m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return (error); } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); if (error) { ARPSTAT_INC(txerrors); ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n", if_name(ifp), error); } return (error); } void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { (void) arprequest_internal(ifp, sip, tip, enaddr); } /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). * We have already checked that we can't use an existing lle without * modification so we have to acquire an LLE_EXCLUSIVE lle lock. * * On success, desten and pflags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ static int arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; char *lladdr; int ll_len; NET_EPOCH_ASSERT(); if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; if ((flags & LLE_CREATE) == 0) la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { char addrbuf[INET_ADDRSTRLEN]; log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", inet_ntoa_r(SIN(dst)->sin_addr, addrbuf), if_name(ifp)); m_freem(m); return (EINVAL); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* Prefer ANY existing lle over newly-created one */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; } } if (la == NULL) { m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { if (flags & LLE_ADDRONLY) { lladdr = la->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = la->r_linkdata; ll_len = la->r_hdrlen; } bcopy(lladdr, desten, ll_len); /* Notify LLE code that the entry was used by datapath */ llentry_mark_used(la); if (pflags != NULL) *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR); if (plle) { LLE_ADDREF(la); *plle = la; } LLE_WUNLOCK(la); return (0); } renew = (la->la_asked == 0 || la->la_expire != time_uptime); /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled, e; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->lle_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL); /* * Only overwrite 'error' in case of error; in case of success * the proper return value was already set above. */ if (e != 0) return (e); return (error); } LLE_WUNLOCK(la); return (error); } /* * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is the storage to put LL header. * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR * * On success, full/partial link header and flags are filled in and * the function returns 0. * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *la = NULL; NET_EPOCH_ASSERT(); if (pflags != NULL) *pflags = 0; if (plle != NULL) *plle = NULL; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(la->r_linkdata, desten, la->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); /* Notify the LLE handling code that the entry was used. */ llentry_mark_used(la); if (plle) { LLE_ADDREF(la); *plle = la; LLE_WUNLOCK(la); } return (0); } if (plle && la) LLE_WUNLOCK(la); return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, desten, pflags, plle)); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; struct ifnet *ifp; char *layer; int hlen; ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); /* Check if length is sufficient */ if (m->m_len < arphdr_len(ar)) { m = m_pullup(m, arphdr_len(ar)); if (m == NULL) { ARP_LOG(LOG_NOTICE, "short packet received on %s\n", if_name(ifp)); return; } ar = mtod(m, struct arphdr *); } hlen = 0; layer = ""; switch (ntohs(ar->ar_hrd)) { case ARPHRD_ETHER: hlen = ETHER_ADDR_LEN; /* RFC 826 */ layer = "ethernet"; break; case ARPHRD_INFINIBAND: hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ layer = "infiniband"; break; case ARPHRD_IEEE1394: hlen = 0; /* SHALL be 16 */ /* RFC 2734 */ layer = "firewire"; /* * Restrict too long hardware addresses. * Currently we are capable of handling 20-byte * addresses ( sizeof(lle->ll_addr) ) */ if (ar->ar_hln >= 20) hlen = 16; break; default: ARP_LOG(LOG_NOTICE, "packet with unknown hardware format 0x%02d received on " "%s\n", ntohs(ar->ar_hrd), if_name(ifp)); m_freem(m); return; } if (hlen != 0 && hlen != ar->ar_hln) { ARP_LOG(LOG_NOTICE, "packet with invalid %s address length %d received on %s\n", layer, ar->ar_hln, if_name(ifp)); m_freem(m); return; } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); static void in_arpinput(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL, *la_tmp; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; struct sockaddr *dst; - struct nhop4_basic nh4; + struct nhop_object *nh; uint8_t linkhdr[LLE_MAX_LINKHDR]; struct route ro; size_t linkhdrsize; int lladdr_off; int error; char addrbuf[INET_ADDRSTRLEN]; NET_EPOCH_ASSERT(); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * We already have checked that mbuf contains enough contiguous data * to hold entire arp message according to the arp header. */ ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); goto match; } /* * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " "%s!\n", inet_ntoa_r(isaddr, addrbuf)); goto drop; } if (ifp->if_addrlen != ah->ar_hln) { ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; dst = (struct sockaddr *)&sin; la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { /* * Request/reply to our address, but no lle exists yet. * Calculate full link prepend to use in lle. */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) goto reply; /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { /* * lle creation may fail if source address belongs * to non-directly connected subnet. However, we * will try to answer the request instead of dropping * frame. */ goto reply; } lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); /* * Check if lle still does not exists. * If it does, that means that we either * 1) have configured it explicitly, via * 1a) 'arp -s' static entry or * 1b) interface address static record * or * 2) it was the result of sending first packet to-host * or * 3) it was another arp reply packet we handled in * different thread. * * In all cases except 3) we definitely need to prefer * existing lle. For the sake of simplicity, prefer any * existing lle over newly-create one. */ if (la_tmp == NULL) lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp == NULL) { arp_mark_lle_reachable(la); LLE_WUNLOCK(la); } else { /* Free newly-create entry and handle packet */ lltable_free_entry(LLTABLE(ifp), la); la = la_tmp; la_tmp = NULL; arp_check_update_lle(ah, isaddr, ifp, bridged, la); /* arp_check_update_lle() returns @la unlocked */ } la = NULL; } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { /* * Destination address is not ours. Check if * proxyarp entry exists or proxyarp is turned on globally. */ struct llentry *lle; sin.sin_addr = itaddr; lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; - if (fib4_lookup_nh_basic(ifp->if_fib, itaddr, 0, 0, - &nh4) != 0) + NET_EPOCH_ASSERT(); + nh = fib4_lookup(ifp->if_fib, itaddr, 0, 0, 0); + if (nh == NULL) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ - if (nh4.nh_ifp == ifp) + if (nh->nh_ifp == ifp) goto drop; (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ - if (fib4_lookup_nh_basic(ifp->if_fib, isaddr, 0, 0, - &nh4) != 0) + nh = fib4_lookup(ifp->if_fib, isaddr, 0, 0, 0); + if (nh == NULL) goto drop; - if (nh4.nh_ifp != ifp) { + if (nh->nh_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s\n", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); goto drop; } #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa_r(itaddr, addrbuf)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa_r(itaddr, addrbuf)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; /* Calculate link header for sending frame */ bzero(&ro, sizeof(ro)); linkhdrsize = sizeof(linkhdr); error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); /* * arp_fillheader() may fail due to lack of support inside encap request * routing. This is not necessary an error, AF_ARP can/should be handled * by if_output(). */ if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); return; } ro.ro_prepend = linkhdr; ro.ro_plen = linkhdrsize; ro.ro_flags = 0; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif /* * Checks received arp data against existing @la. * Updates lle state/performs notification if necessary. */ static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la) { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; char addrbuf[INET_ADDRSTRLEN]; LLE_WLOCK_ASSERT(la); /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", inet_ntoa_r(isaddr, addrbuf), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); return; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa_r(isaddr, addrbuf), ifp->if_xname); return; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", inet_ntoa_r(isaddr, addrbuf), ifp->if_addrlen, (u_char *)la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } /* Calculate full link prepend to use in lle */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, &linkhdrsize, &lladdr_off) != 0) return; /* Check if something has changed */ if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { /* Try to perform LLE update */ if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, lladdr_off) == 0) return; /* Clear fast path feedback request if set */ la->r_skip_req = 0; } arp_mark_lle_reachable(la); /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. */ if (la->la_hold != NULL) { m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; lltable_fill_sa_entry(la, &sa); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } static void arp_mark_lle_reachable(struct llentry *la) { int canceled, wtime; LLE_WLOCK_ASSERT(la); la->ln_state = ARP_LLINFO_REACHABLE; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit; if (wtime < 0) wtime = V_arpt_keep; canceled = callout_reset(&la->lle_timer, hz * wtime, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; } /* * Add permanent link-layer record for given interface address. */ static __noinline void arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst) { struct llentry *lle, *lle_tmp; /* * Interface address LLE record is considered static * because kernel code relies on LLE_STATIC flag to check * if these entries can be rewriten by arp updates. */ lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst); if (lle == NULL) { log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); return; } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Unlink any entry if exists */ lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) lltable_unlink_entry(LLTABLE(ifp), lle_tmp); lltable_link_entry(LLTABLE(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); if (lle_tmp != NULL) lltable_free_entry(LLTABLE(ifp), lle_tmp); } /* * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range * of valid values. */ static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS) { int error; int rexmit_count = *(int *)arg1; error = sysctl_handle_int(oidp, &rexmit_count, 0, req); /* Enforce limits on any new value that may have been set. */ if (!error && req->newptr) { /* A new value was set. */ if (rexmit_count < 0) { rexmit_count = 0; } else if (rexmit_count > MAX_GARP_RETRANSMITS) { rexmit_count = MAX_GARP_RETRANSMITS; } *(int *)arg1 = rexmit_count; } return (error); } /* * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to * retransmit it again. A pending callout owns a reference to the ifa. */ static void garp_rexmit(void *arg) { struct in_ifaddr *ia = arg; if (callout_pending(&ia->ia_garp_timer) || !callout_active(&ia->ia_garp_timer)) { IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); ifa_free(&ia->ia_ifa); return; } CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet); /* * Drop lock while the ARP request is generated. */ IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr, &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp)); /* * Increment the count of retransmissions. If the count has reached the * maximum value, stop sending the GARP packets. Otherwise, schedule * the callout to retransmit another GARP packet. */ ++ia->ia_garp_count; if (ia->ia_garp_count >= garp_rexmit_count) { ifa_free(&ia->ia_ifa); } else { int rescheduled; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); rescheduled = callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia); IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); if (rescheduled) { ifa_free(&ia->ia_ifa); } } CURVNET_RESTORE(); } /* * Start the GARP retransmit timer. * * A single GARP is always transmitted when an IPv4 address is added * to an interface and that is usually sufficient. However, in some * circumstances, such as when a shared address is passed between * cluster nodes, this single GARP may occasionally be dropped or * lost. This can lead to neighbors on the network link working with a * stale ARP cache and sending packets destined for that address to * the node that previously owned the address, which may not respond. * * To avoid this situation, GARP retransmits can be enabled by setting * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater * than zero. The setting represents the maximum number of * retransmissions. The interval between retransmissions is calculated * using an exponential backoff algorithm, doubling each time, so the * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds). */ static void garp_timer_start(struct ifaddr *ifa) { struct in_ifaddr *ia = (struct in_ifaddr *) ifa; IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp); ia->ia_garp_count = 0; if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz, garp_rexmit, ia) == 0) { ifa_ref(ifa); } IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp); } void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { struct epoch_tracker et; const struct sockaddr_in *dst_in; const struct sockaddr *dst; if (ifa->ifa_carp != NULL) return; dst = ifa->ifa_addr; dst_in = (const struct sockaddr_in *)dst; if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY) return; NET_EPOCH_ENTER(et); arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); NET_EPOCH_EXIT(et); if (garp_rexmit_count > 0) { garp_timer_start(ifa); } arp_add_ifa_lle(ifp, dst); } void arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr) { if (ntohl(addr.s_addr) != INADDR_ANY) arprequest(ifp, &addr, &addr, enaddr); } /* * Sends gratuitous ARPs for each ifaddr to notify other * nodes about the address change. */ static __noinline void arp_handle_ifllchange(struct ifnet *ifp) { struct ifaddr *ifa; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } } /* * A handler for interface link layer address change event. */ static void arp_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE(ifp)); if ((ifp->if_flags & IFF_UP) != 0) arp_handle_ifllchange(ifp); } static void vnet_arp_init(void) { if (IS_DEFAULT_VNET(curvnet)) { netisr_register(&arp_nh); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE else netisr_register_vnet(&arp_nh); #endif } VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, vnet_arp_init, 0); #ifdef VIMAGE /* * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5. */ static void vnet_arp_destroy(__unused void *arg) { netisr_unregister_vnet(&arp_nh); } VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_arp_destroy, NULL); #endif Index: head/sys/netinet/in_fib.c =================================================================== --- head/sys/netinet/in_fib.c (revision 362899) +++ head/sys/netinet/in_fib.c (revision 362900) @@ -1,405 +1,246 @@ /*- * Copyright (c) 2015 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_route.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #ifdef INET /* Verify struct route compatiblity */ /* Assert 'struct route_in' is compatible with 'struct route' */ CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4); -static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, - uint32_t flags, struct nhop4_basic *pnh4); -static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, - uint32_t flags, struct nhop4_extended *pnh4); - - -static void -fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst, - uint32_t flags, struct nhop4_basic *pnh4) -{ - - if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; - else - pnh4->nh_ifp = nh->nh_ifp; - pnh4->nh_mtu = nh->nh_mtu; - if (nh->nh_flags & NHF_GATEWAY) - pnh4->nh_addr = nh->gw4_sa.sin_addr; - else - pnh4->nh_addr = dst; - /* Set flags */ - pnh4->nh_flags = nh->nh_flags; - /* TODO: Handle RTF_BROADCAST here */ -} - -static void -fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst, - uint32_t flags, struct nhop4_extended *pnh4) -{ - - if ((flags & NHR_IFAIF) != 0) - pnh4->nh_ifp = nh->nh_ifa->ifa_ifp; - else - pnh4->nh_ifp = nh->nh_ifp; - pnh4->nh_mtu = nh->nh_mtu; - if (nh->nh_flags & NHF_GATEWAY) - pnh4->nh_addr = nh->gw4_sa.sin_addr; - else - pnh4->nh_addr = dst; - /* Set flags */ - pnh4->nh_flags = nh->nh_flags; - pnh4->nh_ia = ifatoia(nh->nh_ifa); - pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr; -} - -/* - * Performs IPv4 route table lookup on @dst. Returns 0 on success. - * Stores nexthop info provided @pnh4 structure. - * Note that - * - nh_ifp cannot be safely dereferenced - * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if - * looking up address on interface "ix0" pointer to "lo0" interface - * will be returned instead of "ix0") - * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed - * - howewer mtu from "transmit" interface will be returned. - */ -int -fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, - uint32_t flowid, struct nhop4_basic *pnh4) -{ - RIB_RLOCK_TRACKER; - struct rib_head *rh; - struct radix_node *rn; - struct sockaddr_in sin; - struct nhop_object *nh; - - KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum")); - rh = rt_tables_get_rnh(fibnum, AF_INET); - if (rh == NULL) - return (ENOENT); - - /* Prepare lookup key */ - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr = dst; - - RIB_RLOCK(rh); - rn = rh->rnh_matchaddr((void *)&sin, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - nh = RNTORT(rn)->rt_nhop; - /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - fib4_rte_to_nh_basic(nh, dst, flags, pnh4); - RIB_RUNLOCK(rh); - - return (0); - } - } - RIB_RUNLOCK(rh); - - return (ENOENT); -} - -/* - * Performs IPv4 route table lookup on @dst. Returns 0 on success. - * Stores extende nexthop info provided @pnh4 structure. - * Note that - * - nh_ifp cannot be safely dereferenced unless NHR_REF is specified. - * - in that case you need to call fib4_free_nh_ext() - * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if - * looking up address of interface "ix0" pointer to "lo0" interface - * will be returned instead of "ix0") - * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed - * - howewer mtu from "transmit" interface will be returned. - */ -int -fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, - uint32_t flowid, struct nhop4_extended *pnh4) -{ - RIB_RLOCK_TRACKER; - struct rib_head *rh; - struct radix_node *rn; - struct sockaddr_in sin; - struct rtentry *rte; - struct nhop_object *nh; - - KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum")); - rh = rt_tables_get_rnh(fibnum, AF_INET); - if (rh == NULL) - return (ENOENT); - - /* Prepare lookup key */ - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr = dst; - - RIB_RLOCK(rh); - rn = rh->rnh_matchaddr((void *)&sin, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif - nh = rte->rt_nhop; - /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - fib4_rte_to_nh_extended(nh, dst, flags, pnh4); - if ((flags & NHR_REF) != 0) { - /* TODO: lwref on egress ifp's ? */ - } - RIB_RUNLOCK(rh); - - return (0); - } - } - RIB_RUNLOCK(rh); - - return (ENOENT); -} - -void -fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4) -{ - -} /* * Looks up path in fib @fibnum specified by @dst. * Returns path nexthop on success. Nexthop is safe to use * within the current network epoch. If longer lifetime is required, * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. */ struct nhop_object * fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (NULL); /* Prepare lookup key */ struct sockaddr_in sin4; memset(&sin4, 0, sizeof(sin4)); sin4.sin_family = AF_INET; sin4.sin_len = sizeof(struct sockaddr_in); sin4.sin_addr = dst; nh = NULL; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); #ifdef RADIX_MPATH if (rt_mpath_next(rt) != NULL) rt = rt_mpath_selectrte(rt, flowid); #endif nh = rt->rt_nhop; /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) nhop_ref_object(nh); RIB_RUNLOCK(rh); return (nh); } } RIB_RUNLOCK(rh); RTSTAT_INC(rts_unreach); return (NULL); } inline static int check_urpf(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { if (src_if != NULL && nh->nh_aifp == src_if) { return (1); } if (src_if == NULL) { if ((flags & NHR_NODEFAULT) == 0) return (1); else if ((nh->nh_flags & NHF_DEFAULT) == 0) return (1); } return (0); } #ifdef RADIX_MPATH inline static int check_urpf_mpath(struct rtentry *rt, uint32_t flags, const struct ifnet *src_if) { while (rt != NULL) { if (check_urpf(rt->rt_nhop, flags, src_if) != 0) return (1); rt = rt_mpath_next(rt); } return (0); } #endif /* * Performs reverse path forwarding lookup. * If @src_if is non-zero, verifies that at least 1 path goes via * this interface. * If @src_if is zero, verifies that route exist. * if @flags contains NHR_NOTDEFAULT, do not consider default route. * * Returns 1 if route matching conditions is found, 0 otherwise. */ int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int ret; KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (0); /* Prepare lookup key */ struct sockaddr_in sin4; memset(&sin4, 0, sizeof(sin4)); sin4.sin_len = sizeof(struct sockaddr_in); sin4.sin_addr = dst; RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); #ifdef RADIX_MPATH ret = check_urpf_mpath(rt, flags, src_if); #else ret = check_urpf(rt->rt_nhop, flags, src_if); #endif RIB_RUNLOCK(rh); return (ret); } RIB_RUNLOCK(rh); return (0); } struct nhop_object * fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET); if (rh == NULL) return (NULL); /* Prepare lookup key */ struct sockaddr_in sin4; memset(&sin4, 0, sizeof(sin4)); sin4.sin_family = AF_INET; sin4.sin_len = sizeof(struct sockaddr_in); sin4.sin_addr = dst; nh = NULL; /* unlocked lookup */ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); #ifdef RADIX_MPATH if (rt_mpath_next(rt) != NULL) rt = rt_mpath_selectrte(rt, 0); #endif nh = rt->rt_nhop; /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) nhop_ref_object(nh); return (nh); } } return (NULL); } #endif Index: head/sys/netinet/in_fib.h =================================================================== --- head/sys/netinet/in_fib.h (revision 362899) +++ head/sys/netinet/in_fib.h (revision 362900) @@ -1,81 +1,55 @@ /*- * Copyright (c) 2015 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_IN_FIB_H_ #define _NETINET_IN_FIB_H_ struct route_in { /* common fields shared among all 'struct route' */ struct nhop_object *ro_nh; struct llentry *ro_lle; char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; /* custom sockaddr */ struct sockaddr_in ro_dst4; }; -/* Basic nexthop info used for uRPF/mtu checks */ -struct nhop4_basic { - struct ifnet *nh_ifp; /* Logical egress interface */ - uint16_t nh_mtu; /* nexthop mtu */ - uint16_t nh_flags; /* nhop flags */ - struct in_addr nh_addr; /* GW/DST IPv4 address */ -}; - -/* Extended nexthop info used for control protocols */ -struct nhop4_extended { - struct ifnet *nh_ifp; /* Logical egress interface */ - struct in_ifaddr *nh_ia; /* Associated address */ - uint16_t nh_mtu; /* nexthop mtu */ - uint16_t nh_flags; /* nhop flags */ - uint8_t spare[4]; - struct in_addr nh_addr; /* GW/DST IPv4 address */ - struct in_addr nh_src; /* default source IPv4 address */ - uint64_t spare2; -}; - -int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, - uint32_t flowid, struct nhop4_basic *pnh4); -int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, - uint32_t flowid, struct nhop4_extended *pnh4); -void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4); - struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, uint32_t flowid); int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); struct nhop_object *fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid, uint32_t flags); #endif Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 362899) +++ head/sys/netinet/in_mcast.c (revision 362900) @@ -1,3058 +1,3063 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK, * IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); int ifma_restart; /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static struct in_mfilter * imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(struct in_mfilter *, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif static struct task free_task; static struct in_multi_head inm_free_list = SLIST_HEAD_INITIALIZER(); static void inm_release_task(void *arg __unused, int pending __unused); static void inm_init(void) { TASK_INIT(&free_task, 0, inm_release_task, NULL); } SYSINIT(inm_init, SI_SUB_TASKQ, SI_ORDER_ANY, inm_init, NULL); void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); taskqueue_enqueue(taskqueue_thread, &free_task); } void inm_disconnect(struct in_multi *inm) { struct ifnet *ifp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->inm_ifp; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->inm_ifma; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; } } } void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused, int pending __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } struct in_mfilter * ip_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in_mfilter *imf; imf = malloc(sizeof(*imf), M_INMFILTER, mflags); if (imf != NULL) imf_init(imf, st0, st1); return (imf); } void ip_mfilter_free(struct in_mfilter *imf) { imf_purge(imf); free(imf, M_INMFILTER); } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct epoch_tracker et; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); NET_EPOCH_ENTER(et); inm = inm_lookup_locked(ifp, ina); NET_EPOCH_EXIT(et); return (inm); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in_mfilter * imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_mfilter *imf; struct in_multi *inm; gsin = (const struct sockaddr_in *)group; IP_MFILTER_FOREACH(imf, &imo->imo_head) { inm = imf->imf_inm; if (inm == NULL) continue; if ((ifp == NULL || (inm->inm_ifp == ifp)) && in_hosteq(inm->inm_addr, gsin->sin_addr)) { break; } } return (imf); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(struct in_mfilter *imf, const struct sockaddr *src) { struct ip_msource find; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in_mfilter *imf; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = imo_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->imf_st[1]; ims = imo_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); IF_ADDR_WLOCK(ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(ifp); } else { *pinm = inm; } IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); error = 0; CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; STAILQ_INIT(&imo->imo_head); INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } static void inp_gcmoptions(struct ip_moptions *imo) { struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); if ((inm = imf->imf_inm) != NULL) { if ((ifp = inm->inm_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in_leavegroup(inm, imf); } } ip_mfilter_free(imf); } free(imo, M_IPMOPTS); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is deferred to the epoch callback task. */ void inp_freemoptions(struct ip_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { struct epoch_tracker et; mreqn.imr_ifindex = ifp->if_index; NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; NET_EPOCH_EXIT(et); } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; - struct nhop4_basic nh4; + struct nhop_object *nh; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(ina, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; - if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) - ifp = nh4.nh_ifp; + nh = fib4_lookup(fibnum, gsin->sin_addr, 0, 0, 0); + if (nh != NULL) + ifp = nh->nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; int error, is_new; ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); else error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqn.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && mreqn.imr_ifindex != 0) ifp = ifnet_byindex(mreqn.imr_ifindex); else ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqn.imr_address); break; } case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = ssa->sin.sin_family = AF_INET; gsa->sin.sin_len = ssa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ssa->sin.sin_addr = mreqs.imr_sourceaddr; ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_inp_locked; } } else { is_new = 0; inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_inp_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } } /* * Begin state merge transaction at IGMP layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &imf->imf_inm); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_inp_unlocked; } if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); goto out_inp_locked; } /* * NOTE: Refcount from in_joingroup_locked() * is protecting membership. */ ip_mfilter_insert(&imo->imo_head, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } imf_commit(imf); imf = NULL; out_inp_locked: INP_WUNLOCK(inp); out_inp_unlocked: IN_MULTI_UNLOCK(); if (is_new && imf) { if (imf->imf_inm != NULL) { IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); inm_release_deferred(imf->imf_inm); IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); } ip_mfilter_free(imf); } return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; int error; bool is_final; ifp = NULL; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); /* * Give up the multicast address record to which * the membership points. */ (void) in_leavegroup_locked(imf->imf_inm, imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ if (!is_final) { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); if (is_final && imf) ip_mfilter_free(imf); IN_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(addr, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); IN_MULTI_LOCK(); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; + struct epoch_tracker et; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: + NET_EPOCH_ENTER(et); error = inp_join_group(inp, sopt); + NET_EPOCH_EXIT(et); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { NET_EPOCH_EXIT(et); CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) { NET_EPOCH_EXIT(et); return (retval); } IN_MULTI_LIST_LOCK(); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IN_MULTI_LIST_UNLOCK(); NET_EPOCH_EXIT(et); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { [MCAST_UNDEFINED] = "un", [MCAST_INCLUDE] = "in", [MCAST_EXCLUDE] = "ex", }; _Static_assert(MCAST_UNDEFINED == 0 && MCAST_EXCLUDE + 1 == nitems(inm_modestrs), "inm_modestrs: no longer matches #defines"); static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { [IGMP_NOT_MEMBER] = "not-member", [IGMP_SILENT_MEMBER] = "silent", [IGMP_REPORTING_MEMBER] = "reporting", [IGMP_IDLE_MEMBER] = "idle", [IGMP_LAZY_MEMBER] = "lazy", [IGMP_SLEEPING_MEMBER] = "sleeping", [IGMP_AWAKENING_MEMBER] = "awakening", [IGMP_G_QUERY_PENDING_MEMBER] = "query-pending", [IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending", [IGMP_LEAVING_MEMBER] = "leaving", }; _Static_assert(IGMP_NOT_MEMBER == 0 && IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs), "inm_statetrs: no longer matches #defines"); static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/ip_options.c =================================================================== --- head/sys/netinet/ip_options.c (revision 362899) +++ head/sys/netinet/ip_options.c (revision 362900) @@ -1,761 +1,763 @@ /* * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE_STATIC(int, ip_dosourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, "Enable forwarding source routed IP packets"); #define V_ip_dosourceroute VNET(ip_dosourceroute) VNET_DEFINE_STATIC(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); #define V_ip_acceptsourceroute VNET(ip_acceptsourceroute) VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; - struct nhop4_extended nh_ext; + struct nhop_object *nh; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; NET_EPOCH_ASSERT(); /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) return 0; else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; } dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (V_ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!V_ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!V_ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (V_ipstealth) goto dropit; #endif if (!V_ip_dosourceroute) { if (V_ipforwarding) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; /* * Acting as a router, so generate * ICMP */ nosourcerouting: log(LOG_WARNING, "attempted source route from %s " "to %s\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, RT_ALL_FIBS); if (ia == NULL) ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, RT_ALL_FIBS); if (ia == NULL) goto bad; memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); } else { /* XXX MRT 0 for routing */ - if (fib4_lookup_nh_ext(M_GETFIB(m), - ipaddr.sin_addr, 0, 0, &nh_ext) != 0) + nh = fib4_lookup(M_GETFIB(m), ipaddr.sin_addr, + 0, NHR_NONE, 0); + if (nh == NULL) goto bad; - memcpy(cp + off, &nh_ext.nh_src, + memcpy(cp + off, &(IA_SIN(nh->nh_ifa)->sin_addr), sizeof(struct in_addr)); } ip->ip_dst = ipaddr.sin_addr; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - } else if (fib4_lookup_nh_ext(M_GETFIB(m), - ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { - memcpy(cp + off, &nh_ext.nh_src, + } else if ((nh = fib4_lookup(M_GETFIB(m), + ipaddr.sin_addr, 0, NHR_NONE, 0)) != NULL) { + memcpy(cp + off, &(IA_SIN(nh->nh_ifa)->sin_addr), sizeof(struct in_addr)); } else { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); cp[IPOPT_OFFSET] += sizeof(uint32_t); } } if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. */ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. */ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. */ void ip_stripoptions(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); int olen; olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), (size_t )(m->m_len - sizeof(struct ip))); } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = NULL; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * Check for the presence of the IP Router Alert option [RFC2113] * in the header of an IPv4 datagram. * * This call is not intended for use from the forwarding path; it is here * so that protocol domains may check for the presence of the option. * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert * option does not have much relevance to the implementation, though this * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. * Length checks on individual options should already have been performed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. */ int ip_checkrouteralert(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); u_char *cp; int opt, optlen, cnt, found_ra; found_ra = 0; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { #ifdef INVARIANTS if (cnt < IPOPT_OLEN + sizeof(*cp)) break; #endif optlen = cp[IPOPT_OLEN]; #ifdef INVARIANTS if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) break; #endif } switch (opt) { case IPOPT_RA: #ifdef INVARIANTS if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) break; else #endif found_ra = 1; break; default: break; } } return (found_ra); } Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 362899) +++ head/sys/netinet6/icmp6.c (revision 362900) @@ -1,2807 +1,2808 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0; VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) #define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static void icmp6_reflect(struct mbuf *, size_t); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int, struct prison *); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); ICMP6STAT_INC(icp6s_outhist[type]); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int code, error, icmp6len, ip6len, noff, off, sum; NET_EPOCH_ASSERT(); m = *mp; off = *offp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ icmp6len = m->m_pkthdr.len - off; if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } ip6 = mtod(m, struct ip6_hdr *); ifp = m->m_pkthdr.rcvif; /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* Calculate the checksum. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_ADMIN_PROHIB; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { if (n->m_len < off + sizeof(*nicmp6)) { n = m_pullup(n, off + sizeof(*nicmp6)); if (n == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); break; } } nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(&m, off, icmp6len) != 0) { *mp = NULL; return (IPPROTO_DONE); } /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; struct prison *pr; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; pr = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_vnet == ifp->if_vnet) break; sx_sunlock(&allprison_lock); if (pr == NULL) pr = curthread->td_ucred->cr_prison; if (mode == FQDN) { if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off, pr); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. */ *mp = NULL; return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); *mp = m; return (IPPROTO_DONE); freeit: m_freem(m); *mp = NULL; return (IPPROTO_DONE); } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; m = *mp; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) { m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: if (m->m_len < eoff + sizeof(struct ip6_ext)) { m = m_pullup(m, eoff + sizeof(struct ip6_ext)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } eh = (struct ip6_ext *) (mtod(m, caddr_t) + eoff); if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ if (m->m_len < eoff + sizeof(*rth)) { m = m_pullup(m, eoff + sizeof(*rth)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; if (m->m_len < eoff + rthlen) { m = m_pullup(m, eoff + rthlen); if (m == NULL) { IP6STAT_INC( ip6s_exthdrtoolong); *mp = m; return (-1); } } rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: if (m->m_len < eoff + sizeof(struct ip6_frag)) { m = m_pullup(m, eoff + sizeof(struct ip6_frag)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); *mp = NULL; return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off, struct prison *pr) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { ifa_free(&ia6->ia_ifa); nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } ifa_free(&ia6->ia_ifa); } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; if (m->m_len < off + sizeof(struct icmp6_nodeinfo) + subjlen) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo) + subjlen); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); goto bad; } } /* ip6 possibly invalid but not used after. */ ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); subj = (char *)(mtod(m, caddr_t) + off + sizeof(struct icmp6_nodeinfo)); if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; NET_EPOCH_ASSERT(); if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; return (addrsofif); } addrs += addrsofif; } return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; NET_EPOCH_ASSERT(); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; NET_EPOCH_ASSERT(); /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); *mp = NULL; return (IPPROTO_DONE); } CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_ip_p != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); continue; } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, inp->in6p_icmp6filt)) { INP_RUNLOCK(inp); continue; } if (last != NULL) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked( &last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = inp; } if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); INP_RUNLOCK(last); } else { m_freem(m); IP6STAT_DEC(ip6s_delivered); } *mp = NULL; return (IPPROTO_DONE); } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ static void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } if (ia != NULL) ifa_free(&ia->ia_ifa); } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo(void) { mld_fasttimo(); } void icmp6_slowtimo(void) { mld_slowtimo(); } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6; struct nd_redirect *nd_rd; struct in6_addr src6, redtgt6, reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; char *lladdr; int icmp6len, is_onlink, is_router, lladdrlen; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; ip6 = mtod(m, struct ip6_hdr *); icmp6len = ntohs(ip6->ip6_plen); if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); ifp = m->m_pkthdr.rcvif; redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, ifp, NULL) || in6_setscope(&reddst6, ifp, NULL)) { goto freeit; } /* validation */ src6 = ip6->ip6_src; if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ - struct nhop6_basic nh6; + struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); - if (fib6_lookup_nh_basic(ifp->if_fib, &kdst, scopeid, 0, 0,&nh6)==0){ - if ((nh6.nh_flags & NHF_GATEWAY) == 0) { + NET_EPOCH_ASSERT(); + nh = fib6_lookup(ifp->if_fib, &kdst, scopeid, 0, 0); + if (nh == NULL) { + struct in6_addr nh_addr; + nh_addr = ifatoia6(nh->nh_ifa)->ia_addr.sin6_addr; + if ((nh->nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* - * Embed scope zone id into next hop address, since - * fib6_lookup_nh_basic() returns address without embedded - * scope zone id. + * Embed scope zone id into next hop address. */ - if (in6_setscope(&nh6.nh_addr, m->m_pkthdr.rcvif, NULL)) - goto freeit; + nh_addr = nh->gw6_sa.sin6_addr; - if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) { + if (IN6_ARE_ADDR_EQUAL(&src6, &nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", - ip6_sprintf(ip6buf, &nh6.nh_addr), + ip6_sprintf(ip6buf, &nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } lladdr = NULL; lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. */ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = 0; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rib_add_redirect(fibnum, (struct sockaddr *)&sdst, gw, (struct sockaddr *)&ssrc, ifp, rt_flags, V_icmp6_redirtimeout); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, M_GETFIB(m0)); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ if (nh->nh_flags & NHF_GATEWAY) { struct sockaddr_in6 *sin6; sin6 = &nh->gw6_sa; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (nh->nh_flags & NHF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; ln = nd6_lookup(router_ll6, 0, ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 362899) +++ head/sys/netinet6/in6.c (revision 362900) @@ -1,2557 +1,2557 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct rt_addrinfo info; struct ifaddr *ifa; struct sockaddr_dl gateway; int fibnum; ifa = &ia->ia_ifa; /* * Prepare info data for the host route. * This code mimics one from ifa_maintain_loopback_route(). */ bzero(&info, sizeof(struct rt_addrinfo)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ifa->ifa_addr; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gateway; link_init_sdl(ifa->ifa_ifp, (struct sockaddr *)&gateway, ifa->ifa_ifp->if_type); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ia62ifa(ia)->ifa_ifp->if_fib; if (cmd == RTM_ADD) { rt_addrmsg(cmd, &ia->ia_ifa, fibnum); rt_routemsg_info(cmd, &info, fibnum); } else if (cmd == RTM_DELETE) { rt_routemsg_info(cmd, &info, fibnum); rt_addrmsg(cmd, &ia->ia_ifa, fibnum); } } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } static struct in6_multi_mship * in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, int *errorp, int delay) { struct in6_multi_mship *imm; int error; imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); if (imm == NULL) { *errorp = ENOBUFS; return (NULL); } delay = (delay * PR_FASTHZ) / hz; error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); if (error) { *errorp = error; free(imm, M_IP6MADDR); return (NULL); } return (imm); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); CK_LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr != NULL) in6_leavegroup(imm->i6mm_maddr, NULL); free(imm, M_IP6MADDR); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } NET_EPOCH_EXIT(et); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto done; } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) goto done; ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } done: WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Invoking IPv6 network device address event may sleep"); ifa_ref(&ia->ia_ifa); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_ADD); ifa_free(&ia->ia_ifa); return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct epoch_tracker et; struct ifaddr *ifa; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct epoch_tracker et; struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; NET_EPOCH_ASSERT(); in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) return (1); } return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ NET_EPOCH_ASSERT(); dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); return (besta); } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); return dep[1]; } return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } NET_EPOCH_EXIT(et); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (0); if ((ND_IFINFO(ifp)->flags & (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD)) != 0) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { struct epoch_tracker et; unsigned long maxmtu = 0; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } NET_EPOCH_EXIT(et); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; - struct nhop6_basic nh6; + struct nhop_object *nh; struct in6_addr dst; uint32_t scopeid; - int error; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; NET_EPOCH_ASSERT(); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; - error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); - if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { + nh = fib6_lookup(fibnum, &dst, scopeid, NHR_NONE, 0); + if (nh && ((nh->nh_flags & NHF_GATEWAY) || nh->nh_ifp != ifp)) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. */ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/in6_fib.c =================================================================== --- head/sys/netinet6/in6_fib.c (revision 362899) +++ head/sys/netinet6/in6_fib.c (revision 362900) @@ -1,441 +1,263 @@ /*- * Copyright (c) 2015 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #ifdef INET6 -static void fib6_rte_to_nh_extended(const struct nhop_object *nh, - const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6); -static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, - uint32_t flags, struct nhop6_basic *pnh6); -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) - CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst); - - - -static void -fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst, - uint32_t flags, struct nhop6_basic *pnh6) -{ - - /* Do explicit nexthop zero unless we're copying it */ - memset(pnh6, 0, sizeof(*pnh6)); - - if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = nh->nh_aifp; - else - pnh6->nh_ifp = nh->nh_ifp; - - pnh6->nh_mtu = nh->nh_mtu; - if (nh->nh_flags & NHF_GATEWAY) { - /* Return address with embedded scope. */ - pnh6->nh_addr = nh->gw6_sa.sin6_addr; - } else - pnh6->nh_addr = *dst; - /* Set flags */ - pnh6->nh_flags = nh->nh_flags; -} - -static void -fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst, - uint32_t flags, struct nhop6_extended *pnh6) -{ - - /* Do explicit nexthop zero unless we're copying it */ - memset(pnh6, 0, sizeof(*pnh6)); - - if ((flags & NHR_IFAIF) != 0) - pnh6->nh_ifp = nh->nh_aifp; - else - pnh6->nh_ifp = nh->nh_ifp; - - pnh6->nh_mtu = nh->nh_mtu; - if (nh->nh_flags & NHF_GATEWAY) { - /* Return address with embedded scope. */ - pnh6->nh_addr = nh->gw6_sa.sin6_addr; - } else - pnh6->nh_addr = *dst; - /* Set flags */ - pnh6->nh_flags = nh->nh_flags; - pnh6->nh_ia = ifatoia6(nh->nh_ifa); -} - -/* - * Performs IPv6 route table lookup on @dst. Returns 0 on success. - * Stores basic nexthop info into provided @pnh6 structure. - * Note that - * - nh_ifp represents logical transmit interface (rt_ifp) by default - * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed - * - mtu from logical transmit interface will be returned. - * - nh_ifp cannot be safely dereferenced - * - nh_ifp represents rt_ifp (e.g. if looking up address on - * interface "ix0" pointer to "ix0" interface will be returned instead - * of "lo0") - * - howewer mtu from "transmit" interface will be returned. - * - scope will be embedded in nh_addr - */ -int -fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scopeid, - uint32_t flags, uint32_t flowid, struct nhop6_basic *pnh6) -{ - RIB_RLOCK_TRACKER; - struct rib_head *rh; - struct radix_node *rn; - struct sockaddr_in6 sin6; - struct nhop_object *nh; - - KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum")); - rh = rt_tables_get_rnh(fibnum, AF_INET6); - if (rh == NULL) - return (ENOENT); - - /* Prepare lookup key */ - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_addr = *dst; - sin6.sin6_len = sizeof(struct sockaddr_in6); - /* Assume scopeid is valid and embed it directly */ - if (IN6_IS_SCOPE_LINKLOCAL(dst)) - sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); - - RIB_RLOCK(rh); - rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - nh = RNTORT(rn)->rt_nhop; - /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6); - RIB_RUNLOCK(rh); - return (0); - } - } - RIB_RUNLOCK(rh); - - return (ENOENT); -} - -/* - * Performs IPv6 route table lookup on @dst. Returns 0 on success. - * Stores extended nexthop info into provided @pnh6 structure. - * Note that - * - nh_ifp cannot be safely dereferenced unless NHR_REF is specified. - * - in that case you need to call fib6_free_nh_ext() - * - nh_ifp represents logical transmit interface (rt_ifp) by default - * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed - * - mtu from logical transmit interface will be returned. - * - scope will be embedded in nh_addr - */ -int -fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid, - uint32_t flags, uint32_t flowid, struct nhop6_extended *pnh6) -{ - RIB_RLOCK_TRACKER; - struct rib_head *rh; - struct radix_node *rn; - struct sockaddr_in6 sin6; - struct rtentry *rte; - struct nhop_object *nh; - - KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum")); - rh = rt_tables_get_rnh(fibnum, AF_INET6); - if (rh == NULL) - return (ENOENT); - - /* Prepare lookup key */ - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *dst; - /* Assume scopeid is valid and embed it directly */ - if (IN6_IS_SCOPE_LINKLOCAL(dst)) - sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); - - RIB_RLOCK(rh); - rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); - if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { - rte = RNTORT(rn); -#ifdef RADIX_MPATH - rte = rt_mpath_select(rte, flowid); - if (rte == NULL) { - RIB_RUNLOCK(rh); - return (ENOENT); - } -#endif - nh = rte->rt_nhop; - /* Ensure route & ifp is UP */ - if (RT_LINK_IS_UP(nh->nh_ifp)) { - fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags, - pnh6); - if ((flags & NHR_REF) != 0) { - /* TODO: Do lwref on egress ifp's */ - } - RIB_RUNLOCK(rh); - - return (0); - } - } - RIB_RUNLOCK(rh); - - return (ENOENT); -} - -void -fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6) -{ - -} /* * Looks up path in fib @fibnum specified by @dst. * Assumes scope is deembedded and provided in @scopeid. * * Returns path nexthop on success. Nexthop is safe to use * within the current network epoch. If longer lifetime is required, * one needs to pass NHR_REF as a flag. This will return referenced * nexthop. */ struct nhop_object * fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, uint32_t flowid) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (NULL); /* TODO: radix changes */ //addr = *dst6; /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst6; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); #ifdef RADIX_MPATH if (rt_mpath_next(rt) != NULL) rt = rt_mpath_selectrte(rt, flowid); #endif nh = rt->rt_nhop; /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) nhop_ref_object(nh); RIB_RUNLOCK(rh); return (nh); } } RIB_RUNLOCK(rh); RTSTAT_INC(rts_unreach); return (NULL); } inline static int check_urpf(const struct nhop_object *nh, uint32_t flags, const struct ifnet *src_if) { if (src_if != NULL && nh->nh_aifp == src_if) { return (1); } if (src_if == NULL) { if ((flags & NHR_NODEFAULT) == 0) return (1); else if ((nh->nh_flags & NHF_DEFAULT) == 0) return (1); } return (0); } #ifdef RADIX_MPATH inline static int check_urpf_mpath(struct rtentry *rt, uint32_t flags, const struct ifnet *src_if) { while (rt != NULL) { if (check_urpf(rt->rt_nhop, flags, src_if) != 0) return (1); rt = rt_mpath_next(rt); } return (0); } #endif /* * Performs reverse path forwarding lookup. * If @src_if is non-zero, verifies that at least 1 path goes via * this interface. * If @src_if is zero, verifies that route exist. * if @flags contains NHR_NOTDEFAULT, do not consider default route. * * Returns 1 if route matching conditions is found, 0 otherwise. */ int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct sockaddr_in6 sin6; int ret; KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (0); /* TODO: radix changes */ /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst6; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); RIB_RLOCK(rh); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); #ifdef RADIX_MPATH ret = check_urpf_mpath(rt, flags, src_if); #else ret = check_urpf(rt->rt_nhop, flags, src_if); #endif RIB_RUNLOCK(rh); return (ret); } RIB_RUNLOCK(rh); return (0); } struct nhop_object * fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; struct sockaddr_in6 sin6; KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum")); rh = rt_tables_get_rnh(fibnum, AF_INET6); if (rh == NULL) return (NULL); /* TODO: radix changes */ //addr = *dst6; /* Prepare lookup key */ memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *dst6; /* Assume scopeid is valid and embed it directly */ if (IN6_IS_SCOPE_LINKLOCAL(dst6)) sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff); rn = rh->rnh_matchaddr((void *)&sin6, &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); nh = rt->rt_nhop; /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { if (flags & NHR_REF) nhop_ref_object(nh); return (nh); } } return (NULL); } #endif Index: head/sys/netinet6/in6_fib.h =================================================================== --- head/sys/netinet6/in6_fib.h (revision 362899) +++ head/sys/netinet6/in6_fib.h (revision 362900) @@ -1,70 +1,43 @@ /*- * Copyright (c) 2015 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET6_IN6_FIB_H_ #define _NETINET6_IN6_FIB_H_ -/* Basic nexthop info used for uRPF/mtu checks */ -struct nhop6_basic { - struct ifnet *nh_ifp; /* Logical egress interface */ - uint16_t nh_mtu; /* nexthop mtu */ - uint16_t nh_flags; /* nhop flags */ - uint8_t spare[4]; - struct in6_addr nh_addr; /* GW/DST IPv4 address */ -}; - -/* Extended nexthop info used for control protocols. */ -struct nhop6_extended { - struct ifnet *nh_ifp; /* Logical egress interface */ - struct in6_ifaddr *nh_ia; /* Associated address. */ - uint16_t nh_mtu; /* nexthop mtu */ - uint16_t nh_flags; /* nhop flags */ - uint8_t spare[4]; - struct in6_addr nh_addr; /* GW/DST IPv6 address */ - uint64_t spare2[1]; -}; - -int fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, - uint32_t scopeid, uint32_t flags, uint32_t flowid,struct nhop6_basic *pnh6); -int fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst, - uint32_t scopeid, uint32_t flags, uint32_t flowid, - struct nhop6_extended *pnh6); -void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6); - struct nhop_object *fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, uint32_t flowid); int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags, const struct ifnet *src_if); struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags); #endif Index: head/sys/netinet6/in6_mcast.c =================================================================== --- head/sys/netinet6/in6_mcast.c (revision 362899) +++ head/sys/netinet6/in6_mcast.c (revision 362900) @@ -1,2906 +1,2906 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK, * IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in6_multi_list_mtx; MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); struct mtx in6_multi_free_mtx; MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); struct sx in6_multi_sx; SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(struct in6_mfilter *, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6_joingroup_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } struct in6_mfilter * ip6_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in6_mfilter *imf; imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags); if (imf != NULL) im6f_init(imf, st0, st1); return (imf); } void ip6_mfilter_free(struct in6_mfilter *imf) { im6f_purge(imf); free(imf, M_IN6MFILTER); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_mfilter *imf; struct in6_multi *inm; gsin6 = (const struct sockaddr_in6 *)group; IP6_MFILTER_FOREACH(imf, &imo->im6o_head) { inm = imf->im6f_in6m; if (inm == NULL) continue; if ((ifp == NULL || (inm->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &gsin6->sin6_addr)) { break; } } return (imf); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in6_msource * im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src) { struct ip6_msource find; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in6_mfilter *imf; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = im6o_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->im6f_st[1]; ims = im6o_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct epoch_tracker et; struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, group); NET_EPOCH_EXIT(et); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ static void in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; ifp = inm->in6m_ifp; MPASS(ifma->ifma_llifma == NULL); /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == NULL, ("%s: ifma_protospec != NULL", __func__)); if (ifp == NULL) ifp = ifma->ifma_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Interface detach can happen in a taskqueue thread context, so we must use a * dedicated thread to avoid deadlocks when draining in6m_release tasks. */ TASKQUEUE_DEFINE_THREAD(in6m_free); static struct task in6m_free_task; static struct in6_multi_head in6m_free_list = SLIST_HEAD_INITIALIZER(); static void in6m_release_task(void *arg __unused, int pending __unused); static void in6m_init(void) { TASK_INIT(&in6m_free_task, 0, in6m_release_task, NULL); } SYSINIT(in6m_init, SI_SUB_TASKQ, SI_ORDER_ANY, in6m_init, NULL); void in6m_release_list_deferred(struct in6_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); taskqueue_enqueue(taskqueue_in6m_free, &in6m_free_task); } void in6m_release_wait(void) { taskqueue_drain_all(taskqueue_in6m_free); } void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; if (ifp == NULL) return; /* already called */ inm->in6m_ifp = NULL; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->in6m_ifma; if (ifma == NULL) return; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); } } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (void *)ifa; LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships, i6mm_chain, imm_tmp) { if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); in6m_rele_locked(inmh, inm); } } } } static void in6m_release_task(void *arg __unused, int pending __unused) { struct in6_multi_head in6m_free_tmp; struct in6_multi *inm, *tinm; SLIST_INIT(&in6m_free_tmp); mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); IN6_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); in6m_release(inm); } IN6_MULTI_UNLOCK(); } /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ int in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. */ static int in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_multi_head inmh; struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in6_getmulti(ifp, mcaddr, &inm); if (error) { CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: SLIST_INIT(&inmh); if (error) { struct epoch_tracker et; CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_multi_head inmh; struct in6_mfilter timf; struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = 0; if (ifp) error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) IF_ADDR_WLOCK(ifp); SLIST_INIT(&inmh); if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); if (ifp) IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; STAILQ_INIT(&imo->im6o_head); INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. * * XXX can all be safely deferred to epoch_call * */ static void inp_gcmoptions(struct ip6_moptions *imo) { struct in6_mfilter *imf; struct in6_multi *inm; struct ifnet *ifp; while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); if ((inm = imf->im6f_in6m) != NULL) { if ((ifp = inm->in6m_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in6_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in6_leavegroup(inm, imf); } } ip6_mfilter_free(imf); } free(imo, M_IP6MOPTS); } void ip6_freemoptions(struct ip6_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. */ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. Look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6) { - struct nhop6_basic nh6; + struct nhop_object *nh; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(inp->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = inp ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB; - if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) - return (NULL); + nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0); - return (nh6.nh_ifp); + return (nh ? nh->nh_ifp : NULL); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct in6_multi_head inmh; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; int error, is_new; SLIST_INIT(&inmh); ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. */ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { if (V_if_index < mreq.ipv6mr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(mreq.ipv6mr_interface); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_in6p_locked; } } else { is_new = 0; inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_in6p_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } } /* * Begin state merge transaction at MLD layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &imf->im6f_in6m, 0); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_in6p_unlocked; } if (error) { goto out_in6p_locked; } /* * NOTE: Refcount from in6_joingroup_locked() * is protecting membership. */ ip6_mfilter_insert(&imo->im6o_head, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } im6f_commit(imf); imf = NULL; out_in6p_locked: INP_WUNLOCK(inp); out_in6p_unlocked: IN6_MULTI_UNLOCK(); if (is_new && imf) { if (imf->im6f_in6m != NULL) { struct in6_multi_head inmh; SLIST_INIT(&inmh); SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer); in6m_release_list_deferred(&inmh); } ip6_mfilter_free(imf); } return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. */ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; int error; bool is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { if (V_if_index < ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. */ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { ifp = ifnet_byindex(ifindex); } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); /* * Give up the multicast address record to which * the membership points. */ (void)in6_leavegroup_locked(inm, imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ if (!is_final) { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); if (is_final && imf) ip6_mfilter_free(imf); IN6_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); if (V_if_index < ifindex) return (EINVAL); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EINVAL); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); } imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. */ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_MLD, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { NET_EPOCH_EXIT(et); CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); if (retval) { NET_EPOCH_EXIT(et); return (retval); } IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); NET_EPOCH_EXIT(et); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. */ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ Index: head/sys/netinet6/in6_src.c =================================================================== --- head/sys/netinet6/in6_src.c (revision 362899) +++ head/sys/netinet6/in6_src.c (revision 362900) @@ -1,1225 +1,1226 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) VNET_DEFINE_STATIC(struct in6_addrpolicy, defaultaddrpolicy); #define V_defaultaddrpolicy VNET(defaultaddrpolicy) VNET_DEFINE(int, ip6_prefer_tempaddr) = 0; static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct nhop_object **, int, u_int, uint32_t); static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct ifnet **, struct ifnet *, u_int); static int in6_selectsrc(uint32_t, struct sockaddr_in6 *, struct ip6_pktopts *, struct inpcb *, struct ucred *, struct ifnet **, struct in6_addr *); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *), void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ IP6STAT_INC(ip6s_sources_rule[(r)]); \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ IP6STAT_INC(ip6s_sources_rule[(r)]); \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) static int in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred, struct ifnet **ifpp, struct in6_addr *srcp) { struct rm_priotracker in6_ifa_tracker; struct in6_addr dst, tmp; struct ifnet *ifp = NULL, *oifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; int error; struct ip6_moptions *mopts; KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__)); dst = dstsock->sin6_addr; /* make a copy for local operation */ if (ifpp) { /* * Save a possibly passed in ifp for in6_selectsrc. Only * neighbor discovery code should use this feature, where * we may know the interface but not the FIB number holding * the connected subnet in case someone deleted it from the * default FIB and we need to check the interface. */ if (*ifpp != NULL) oifp = *ifpp; *ifpp = NULL; } if (inp != NULL) { INP_LOCK_ASSERT(inp); mopts = inp->in6p_moptions; } else { mopts = NULL; } /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp, fibnum)) != 0) return (error); /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ tmp = pi->ipi6_addr; if (ifp) { error = in6_setscope(&tmp, ifp, &odstzone); if (error) return (error); } if (cred != NULL && (error = prison_local_ip6(cred, &tmp, (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) return (error); /* * If IPV6_BINDANY socket option is set, we allow to specify * non local addresses as source address in IPV6_PKTINFO * ancillary data. */ if ((inp->inp_flags & INP_BINDANY) == 0) { ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */); if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (EADDRNOTAVAIL); } bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp)); ifa_free(&ia->ia_ifa); } else bcopy(&tmp, srcp, sizeof(*srcp)); pi->ipi6_addr = tmp; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; return (0); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (cred != NULL && (error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp)); return (0); } /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip6(cred, srcp)) return (0); /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp, (inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0) return (error); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif error = in6_setscope(&dst, ifp, &odstzone); if (error) return (error); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. */ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* If jailed only take addresses of the jail into account. */ if (cred != NULL && prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) { if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); } /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 9: prefer address with better virtual status. */ if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa)) REPLACE(9); if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa)) NEXT(9); /* * Rule 10: prefer address with `prefer_source' flag. */ if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0) REPLACE(10); if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0) NEXT(10); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? */ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } /* * At this point at least one of the addresses belonged to the jail * but it could still be, that we want to further restrict it, e.g. * theoratically IN6_IS_ADDR_LOOPBACK. * It must not be IN6_IS_ADDR_UNSPECIFIED anymore. * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should * let all others previously selected pass. * Use tmp to not change ::1 on lo0 to the primary jail address. */ tmp = ia->ia_addr.sin6_addr; if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } if (ifpp) *ifpp = ifp; bcopy(&tmp, srcp, sizeof(*srcp)); if (ia->ia_ifp == ifp) IP6STAT_INC(ip6s_sources_sameif[best_scope]); else IP6STAT_INC(ip6s_sources_otherif[best_scope]); if (dst_scope == best_scope) IP6STAT_INC(ip6s_sources_samescope[best_scope]); else IP6STAT_INC(ip6s_sources_otherscope[best_scope]); if (IFA6_IS_DEPRECATED(ia)) IP6STAT_INC(ip6s_sources_deprecated[best_scope]); IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Select source address based on @inp, @dstsock and @opts. * Stores selected address to @srcp. If @scope_ambiguous is set, * embed scope from selected outgoing interface. If @hlim pointer * is provided, stores calculated hop limit there. * Returns 0 on success. */ int in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred, int scope_ambiguous, struct in6_addr *srcp, int *hlim) { struct ifnet *retifp; uint32_t fibnum; int error; fibnum = inp->inp_inc.inc_fibnum; retifp = NULL; error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp); if (error != 0) return (error); if (hlim != NULL) *hlim = in6_selecthlim(inp, retifp); if (retifp == NULL || scope_ambiguous == 0) return (0); /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface. determine the zone ID based on the interface. */ error = in6_setscope(&dstsock->sin6_addr, retifp, NULL); return (error); } /* * Select source address based on @fibnum, @dst and @scopeid. * Stores selected address to @srcp. * Returns 0 on success. * * Used by non-socket based consumers (ND code mostly) */ int in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst, uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp, int *hlim) { struct ifnet *retifp; struct sockaddr_in6 dst_sa; int error; retifp = ifp; bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = *dst; dst_sa.sin6_scope_id = scopeid; sa6_embedscope(&dst_sa, 0); error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp); if (hlim != NULL) *hlim = in6_selecthlim(NULL, retifp); return (error); } /* * clone - meaningful only for bsdi and freebsd */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct nhop_object **retnh, int norouteok, u_int fibnum, uint32_t flowid) { int error = 0; struct ifnet *ifp = NULL; struct nhop_object *nh = NULL; struct sockaddr_in6 *sin6_next; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; uint32_t zoneid; #if 0 char ip6buf[INET6_ADDRSTRLEN]; if (dstsock->sin6_addr.s6_addr32[0] == 0 && dstsock->sin6_addr.s6_addr32[1] == 0 && !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { printf("%s: strange destination %s\n", __func__, ip6_sprintf(ip6buf, &dstsock->sin6_addr)); } else { printf("%s: destination = %s%%%d\n", __func__, ip6_sprintf(ip6buf, &dstsock->sin6_addr), dstsock->sin6_scope_id); /* for debug */ } #endif /* If the caller specify the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || retnh == NULL || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } /* * If destination address is LLA or link- or node-local multicast, * use it's embedded scope zone id to determine outgoing interface. */ if (IN6_IS_ADDR_MC_LINKLOCAL(dst) || IN6_IS_ADDR_MC_NODELOCAL(dst)) { zoneid = ntohs(in6_getscope(dst)); if (zoneid > 0) { ifp = in6_getlinkifnet(zoneid); goto done; } } getroute: /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron; sin6_next = satosin6(opts->ip6po_nexthop); if (IN6_IS_ADDR_LINKLOCAL(&sin6_next->sin6_addr)) { /* * Next hop is LLA, thus it should be neighbor. * Determine outgoing interface by zone index. */ zoneid = ntohs(in6_getscope(&sin6_next->sin6_addr)); if (zoneid > 0) { ifp = in6_getlinkifnet(zoneid); goto done; } } ron = &opts->ip6po_nextroute; /* Use a cached route if it exists and is valid. */ if (ron->ro_nh != NULL && ( !NH_IS_VALID(ron->ro_nh) || ron->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr, &sin6_next->sin6_addr))) RO_NHFREE(ron); if (ron->ro_nh == NULL) { ron->ro_dst = *sin6_next; /* * sin6_next is not link-local OR scopeid is 0, * no need to clear scope */ ron->ro_nh = fib6_lookup(fibnum, &sin6_next->sin6_addr, 0, NHR_REF, flowid); } /* * The node identified by that address must be a * neighbor of the sending host. */ if (ron->ro_nh == NULL || (ron->ro_nh->nh_flags & NHF_GATEWAY) != 0) error = EHOSTUNREACH; else { nh = ron->ro_nh; ifp = nh->nh_ifp; } goto done; } /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. */ if (ro) { if (ro->ro_nh && (!NH_IS_VALID(ro->ro_nh) || ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { RO_NHFREE(ro); } if (ro->ro_nh == (struct nhop_object *)NULL) { struct sockaddr_in6 *sa6; /* No route yet, so try to acquire one */ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; /* * Currently dst has scopeid embedded iff it is LL. * New routing API accepts scopeid as a separate argument. * Convert dst before/after doing lookup */ uint32_t scopeid = 0; if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) { /* Unwrap in6_getscope() and in6_clearscope() */ scopeid = ntohs(sa6->sin6_addr.s6_addr16[1]); sa6->sin6_addr.s6_addr16[1] = 0; } ro->ro_nh = fib6_lookup(fibnum, &sa6->sin6_addr, scopeid, NHR_REF, flowid); if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) sa6->sin6_addr.s6_addr16[1] = htons(scopeid); } /* * do not care about the result if we have the nexthop * explicitly specified. */ if (opts && opts->ip6po_nexthop) goto done; if (ro->ro_nh) ifp = ro->ro_nh->nh_ifp; else error = EHOSTUNREACH; nh = ro->ro_nh; /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). * Note that loopback interface is always okay. * (this may happen when we are sending a packet to one of * our own addresses.) */ if (ifp && opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) { if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) { error = EHOSTUNREACH; goto done; } } } done: if (ifp == NULL && nh == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case an error. */ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) IP6STAT_INC(ip6s_noroute); if (retifp != NULL) { if (nh != NULL) *retifp = nh->nh_aifp; else *retifp = ifp; } if (retnh != NULL) *retnh = nh; /* nh may be NULL */ return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct ifnet **retifp, struct ifnet *oifp, u_int fibnum) { int error; struct route_in6 sro; struct nhop_object *nh = NULL; uint16_t nh_flags; KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__)); bzero(&sro, sizeof(sro)); nh_flags = 0; error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0); if (nh != NULL) nh_flags = nh->nh_flags; if (nh != NULL && nh == sro.ro_nh) NH_FREE(nh); if (error != 0) { /* Help ND. See oifp comment in in6_selectsrc(). */ if (oifp != NULL && fibnum == RT_DEFAULT_FIB) { *retifp = oifp; error = 0; } return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) { error = (nh_flags & NHF_HOST ? EHOSTUNREACH : ENETUNREACH); return (error); } return (0); } /* Public wrapper function to selectroute(). */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid) { return (selectroute(dstsock, opts, mopts, ro, retifp, retnh, 0, fibnum, flowid)); } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct inpcb *inp, struct ifnet *ifp) { if (inp && inp->in6p_hops >= 0) return (inp->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (inp && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { - struct nhop6_basic nh6; + struct nhop_object *nh; struct in6_addr dst; uint32_t fibnum, scopeid; int hlim; fibnum = inp->inp_inc.inc_fibnum; in6_splitscope(&inp->in6p_faddr, &dst, &scopeid); - if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){ - hlim = ND_IFINFO(nh6.nh_ifp)->chlim; + nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0); + if (nh != NULL) { + hlim = ND_IFINFO(nh->nh_ifp)->chlim; return (hlim); } } return (V_ip6_defhlim); } /* * XXX: this is borrowed from in6_pcbbind(). If possible, we should * share this function by all *bsd*... */ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { struct socket *so = inp->inp_socket; u_int16_t lport = 0; int error, lookupflags = 0; #ifdef INVARIANTS struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #endif INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); error = prison_local_ip6(cred, laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)); if (error) return(error); /* XXX: this is redundant when called from in6_pcbbind */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; inp->inp_flags |= INP_ANONPORT; error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags); if (error != 0) return (error); inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } return (0); } void addrsel_policy_init(void) { init_policy_queue(); /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; if (!IS_DEFAULT_VNET(curvnet)) return; ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. */ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD | CTLFLAG_MPSAFE, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbages (if any) of the prefix address. */ IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr); ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The followings are implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX implementation using binary tree should be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); VNET_DEFINE_STATIC(struct addrsel_policyhead, addrsel_policytab); #define V_addrsel_policytab VNET(addrsel_policytab) static void init_policy_queue(void) { TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { struct addrsel_policyent *new, *pol; new = malloc(sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(new, M_IFADDR); return (EEXIST); /* or override it? */ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(pol, M_IFADDR); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } Index: head/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw2.c (revision 362899) +++ head/sys/netpfil/ipfw/ip_fw2.c (revision 362900) @@ -1,3557 +1,3560 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include /* for struct grehdr */ #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } /* * Parse TCP options. The logic copied from tcp_dooptions(). */ static int tcpopts_parse(const struct tcphdr *tcp, uint16_t *mss) { const u_char *cp = (const u_char *)(tcp + 1); int optlen, bits = 0; int cnt = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) break; bits |= IP_FW_TCPOPT_MSS; if (mss != NULL) *mss = be16dec(cp + 2); break; case TCPOPT_WINDOW: if (optlen == TCPOLEN_WINDOW) bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: if (optlen == TCPOLEN_SACK_PERMITTED) bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_SACK: if (optlen > 2 && (optlen - 2) % TCPOLEN_SACK == 0) bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: if (optlen == TCPOLEN_TIMESTAMP) bits |= IP_FW_TCPOPT_TS; break; } } return (bits); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { return (flags_match(cmd, tcpopts_parse(tcp, NULL))); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return (1); /* match */ } #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else - struct nhop4_basic nh4; + struct nhop_object *nh; - if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) + nh = fib4_lookup(fib, src, 0, NHR_NONE, 0); + if (nh == NULL) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ - if (ifp != NULL && ifp != nh4.nh_ifp) + if (ifp != NULL && ifp != nh->nh_aifp) return (0); /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) + if (ifp == NULL && (nh->nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ - if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) + if (ifp == NULL && (nh->nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } /* * Generate an SCTP packet containing an ABORT chunk. The verification tag * is given by vtag. The T-bit is set in the ABORT chunk if and only if * reflected is not 0. */ static struct mbuf * ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag, int reflected) { struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct sctphdr *sctp; struct sctp_chunkhdr *chunk; u_int16_t hlen, plen, tlen; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: hlen = sizeof(struct ip); break; #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); tlen = hlen + plen; m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = tlen; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, tlen); switch (id->addr_type) { case 4: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(tlen); ip->ip_id = htons(0); ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_SCTP; ip->ip_sum = 0; ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); sctp = (struct sctphdr *)(ip + 1); break; #ifdef INET6 case 6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(plen); ip6->ip6_nxt = IPPROTO_SCTP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = id->dst_ip6; ip6->ip6_dst = id->src_ip6; sctp = (struct sctphdr *)(ip6 + 1); break; #endif } sctp->src_port = htons(id->dst_port); sctp->dest_port = htons(id->src_port); sctp->v_tag = htonl(vtag); sctp->checksum = htonl(0); chunk = (struct sctp_chunkhdr *)(sctp + 1); chunk->chunk_type = SCTP_ABORT_ASSOCIATION; chunk->chunk_flags = 0; if (reflected != 0) { chunk->chunk_flags |= SCTP_HAD_NO_TCB; } chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr)); sctp->checksum = sctp_calculate_cksum(m, hlen); return (m); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; int len, dir; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { - struct nhop6_basic nh6; + struct nhop_object *nh; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); - if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) + nh = fib6_lookup(fib, src, 0, NHR_NONE, 0); + if (nh == NULL) return (0); /* If ifp is provided, check for equality with route table. */ - if (ifp != NULL && ifp != nh6.nh_ifp) + if (ifp != NULL && ifp != nh->nh_aifp) return (0); /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) + if (ifp == NULL && (nh->nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ - if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) + if (ifp == NULL && (nh->nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static int map_icmp_unreach(int code) { /* RFC 7915 p4.2 */ switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: return (ICMP6_DST_UNREACH_NOROUTE); case ICMP_UNREACH_PORT: return (ICMP6_DST_UNREACH_NOPORT); default: /* * Map the rest of codes into admit prohibited. * XXX: unreach proto should be mapped into ICMPv6 * parameter problem, but we use only unreach type. */ return (ICMP6_DST_UNREACH_ADMIN); } } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code == ICMP6_UNREACH_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m0; struct sctphdr *sctp; u_int32_t v_tag; int reflected; sctp = (struct sctphdr *)((char *)ip6 + hlen); reflected = 1; v_tag = ntohl(sctp->v_tag); /* Investigate the first chunk header if available */ if (m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { struct sctp_chunkhdr *chunk; chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (m->m_pkthdr.len > hlen + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { struct sctp_init *init; init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but do not do that to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m0 = NULL; } else { m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else if (code == ICMP_REJECT_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m; struct sctphdr *sctp; struct sctp_chunkhdr *chunk; struct sctp_init *init; u_int32_t v_tag; int reflected; sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *)); reflected = 1; v_tag = ntohl(sctp->v_tag); if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { /* Look at the first chunk header if available */ chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (iplen > (ip->ip_hl << 2) + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but do not do that to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m = NULL; } else { m = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; int lookupflags; int match; id = &args->f_id; inp = args->inp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else if (id->proto == IPPROTO_UDPLITE) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_ulitecbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (args->flags & IPFW_ARGS_IN) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, args->ifp, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (args->flags & IPFW_ARGS_IN) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, args->ifp, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; args->flags |= IPFW_ARGS_REF; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->ifp Incoming or outgoing interface. * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * OR * args->mem Pointer to contigous memory chunk. * ip Is the beginning of the ip(4 or 6) header. * eh Ethernet header in case if input is Layer2. */ struct mbuf *m; struct ip *ip; struct ether_header *eh; /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; int f_pos = 0; /* index of current rule in the array */ int retval = 0; struct ifnet *oif, *iif; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port, dst_port; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ int iplen = 0; int pktlen; struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; bool mem; if ((mem = (args->flags & IPFW_ARGS_LENMASK))) { if (args->flags & IPFW_ARGS_ETHER) { eh = (struct ether_header *)args->mem; if (eh->ether_type == htons(ETHERTYPE_VLAN)) ip = (struct ip *) ((struct ether_vlan_header *)eh + 1); else ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = (struct ip *)args->mem; } pktlen = IPFW_ARGS_LENGTH(args->flags); args->f_id.fib = args->ifp->if_fib; /* best guess */ } else { m = args->m; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ if (args->flags & IPFW_ARGS_ETHER) { /* We need some amount of data to be contiguous. */ if (m->m_len < min(m->m_pkthdr.len, max_protohdr) && (args->m = m = m_pullup(m, min(m->m_pkthdr.len, max_protohdr))) == NULL) goto pullup_failed; eh = mtod(m, struct ether_header *); ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = mtod(m, struct ip *); } pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* mbuf not altered */ } dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ src_port = dst_port = 0; DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define EHLEN (eh != NULL ? ((char *)ip - (char *)eh) : 0) #define _PULLUP_LOCKED(_len, p, T, unlock) \ do { \ int x = (_len) + T + EHLEN; \ if (mem) { \ if (__predict_false(pktlen < x)) { \ unlock; \ goto pullup_failed; \ } \ p = (char *)args->mem + (_len) + EHLEN; \ } else { \ if (__predict_false((m)->m_len < x)) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) { \ unlock; \ goto pullup_failed; \ } \ } \ p = mtod(m, char *) + (_len) + EHLEN; \ } \ } while (0) #define PULLUP_LEN(_len, p, T) _PULLUP_LOCKED(_len, p, T, ) #define PULLUP_LEN_LOCKED(_len, p, T) \ _PULLUP_LOCKED(_len, p, T, IPFW_PF_RUNLOCK(chain)); \ UPDATE_POINTERS() /* * In case pointers got stale after pullups, update them. */ #define UPDATE_POINTERS() \ do { \ if (!mem) { \ if (eh != NULL) { \ eh = mtod(m, struct ether_header *); \ ip = (struct ip *)(eh + 1); \ } else \ ip = mtod(m, struct ip *); \ args->m = m; \ } \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->flags |= IPFW_ARGS_IP6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_GRE: /* RFC 1701 */ /* XXX GRE header check? */ PULLUP_TO(hlen, ulp, struct grehdr); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, offsetof( struct carp_header, carp_counter)); if (CARP_ADVERTISEMENT != ((struct carp_header *)ulp)->carp_type) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } UPDATE_POINTERS(); ip6 = (struct ip6_hdr *)ip; args->f_id.addr_type = 6; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6); } else if (pktlen >= sizeof(struct ip) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) && ip->ip_v == 4) { is_ipv4 = 1; args->flags |= IPFW_ARGS_IP4; hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster * matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } else { if (offset == 1 && proto == IPPROTO_TCP) { /* RFC 3128 */ goto pullup_failed; } } UPDATE_POINTERS(); args->f_id.addr_type = 4; args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } else { proto = 0; dst_ip.s_addr = src_ip.s_addr = 0; args->f_id.addr_type = 1; /* XXX */ } #undef PULLUP_TO pktlen = iplen < pktlen ? iplen: pktlen; /* Properly initialize the rest of f_id */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->flags & IPFW_ARGS_REF) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } if (args->flags & IPFW_ARGS_IN) { iif = args->ifp; oif = NULL; } else { MPASS(args->flags & IPFW_ARGS_OUT); iif = mem ? NULL : m_rcvif(m); oif = args->ifp; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(iif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(args->ifp, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->flags & IPFW_ARGS_ETHER) { u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->flags & IPFW_ARGS_ETHER) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (ntohs(eh->ether_type) >= p[0] && ntohs(eh->ether_type) <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->flags & IPFW_ARGS_ETHER); break; case O_DIVERTED: if ((args->flags & IPFW_ARGS_REF) == 0) break; /* * For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ match = ((args->rule.info & IPFW_IS_MASK) == IPFW_IS_DIVERT) && ( ((args->rule.info & IPFW_INFO_IN) ? 1: 2) & cmd->arg1); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_DST_LOOKUP: { void *pkey; uint32_t vidx, key; uint16_t keylen; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* Determine lookup key type */ vidx = ((ipfw_insn_u32 *)cmd)->d[1]; if (vidx != 4 /* uid */ && vidx != 5 /* jail */ && is_ipv6 == 0 && is_ipv4 == 0) break; /* Determine key length */ if (vidx == 0 /* dst-ip */ || vidx == 1 /* src-ip */) keylen = is_ipv6 ? sizeof(struct in6_addr): sizeof(in_addr_t); else { keylen = sizeof(key); pkey = &key; } if (vidx == 0 /* dst-ip */) pkey = is_ipv4 ? (void *)&dst_ip: (void *)&args->f_id.dst_ip6; else if (vidx == 1 /* src-ip */) pkey = is_ipv4 ? (void *)&src_ip: (void *)&args->f_id.src_ip6; else if (vidx == 6 /* dscp */) { if (is_ipv4) key = ip->ip_tos >> 2; else { key = args->f_id.flow_id6; key = (key & 0x0f) << 2 | (key & 0xf000) >> 14; } key &= 0x3f; } else if (vidx == 2 /* dst-port */ || vidx == 3 /* src-port */) { /* Skip fragments */ if (offset != 0) break; /* Skip proto without ports */ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && proto != IPPROTO_SCTP) break; if (vidx == 2 /* dst-port */) key = dst_port; else key = src_port; } #ifndef USERSPACE else if (vidx == 4 /* uid */ || vidx == 5 /* jail */) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache->cr_uid; else if (vidx == 5 /* jail */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache.uid; else if (vidx == 5 /* jail */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } #endif /* !USERSPACE */ else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; tablearg = vidx; break; } /* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */ /* FALLTHROUGH */ } case O_IP_SRC_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen; if (is_ipv4) { keylen = sizeof(in_addr_t); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &dst_ip; else pkey = &src_ip; } else if (is_ipv6) { keylen = sizeof(struct in6_addr); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &args->f_id.dst_ip6; else pkey = &args->f_id.src_ip6; } else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, vidx, tag); if (!match) break; } tablearg = vidx; break; } case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table(chain, cmd->arg1, 0, &args->f_id, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, v, tag); if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { match = in_localip(src_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { match = in_localip(dst_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_TCP || proto == IPPROTO_SCTP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = ((is_ipv4 || is_ipv6) && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPTTL: if (!is_ipv4) break; case O_IPLEN: { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; #ifdef INET6 if (is_ipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip; if (ip6->ip6_plen == 0) { /* * Jumbo payload is not * supported by this * opcode. */ break; } x = iplen - hlen; } else #endif /* INET6 */ x = iplen - (ip->ip_hl << 2); tcp = TCP(ulp); x -= tcp->th_off << 2; if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPMSS: if (proto == IPPROTO_TCP && (args->f_id._flags & TH_SYN) != 0 && ulp != NULL) { uint16_t mss, *p; int i; PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); if ((tcpopts_parse(TCP(ulp), &mss) & IP_FW_TCPOPT_MSS) == 0) break; if (cmdlen == 1) { match = (cmd->arg1 == mss); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (mss >= p[0] && mss <= p[1]); } break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = (args->flags & IPFW_ARGS_OUT || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib)))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib); else match = 1; break; case O_IPSEC: match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else if (proto == IPPROTO_UDPLITE) pi = &V_ulitecbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incoming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule (one * exception is O_SKIP_ACTION which could be * between these opcodes and 'action' one). * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. * * O_SKIP_ACTION: this opcode is not a real 'action' * either, and is stored right before the 'action' * part of the rule, right after the O_KEEP_STATE * opcode. It causes match failure so the real * 'action' could be executed only if the rule * is checked via dynamic rule from the state * table, as in such case execution starts * from the true 'action' opcode directly. * */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, (ipfw_insn_limit *)cmd, args, ulp, pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && (q = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule * by setting f, cmd, l and clearing * cmdlen. */ f = q; f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_SKIP_ACTION: match = 0; /* skip to the next rule */ l = 0; /* exit inner loop */ break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->flags & IPFW_ARGS_ETHER) break; /* not on layer 2 */ /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST( &args->f_id.dst_ip6)) { send_reject6(args, cmd->opcode == O_REJECT ? map_icmp_unreach(cmd->arg1): cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. */ if (is_ipv6) { struct ip_fw_nh6 *nh6; args->flags |= IPFW_ARGS_NH6; nh6 = &args->hopstore6; nh6->sin6_addr = TARG_VAL( chain, tablearg, nh6); nh6->sin6_port = sa->sin_port; nh6->sin6_scope_id = TARG_VAL( chain, tablearg, zoneid); } else #endif { args->flags |= IPFW_ARGS_NH4; args->hopstore.sin_port = sa->sin_port; sa = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->flags |= IPFW_ARGS_NH4PTR; args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->flags |= IPFW_ARGS_NH6PTR; args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; /* XXX */ l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ /* * Ensure that we do not invoke NAT handler for * non IPv4 packets. Libalias expects only IPv4. */ if (!is_ipv4 || !IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; args->rule.info = 0; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; l = 0; /* in any case exit inner loop */ if (is_ipv6) /* IPv6 is not supported yet */ break; IPFW_INC_RULE_COUNTER(f, pktlen); ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; args->rule.info = 0; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } case O_EXTERNAL_ACTION: l = 0; /* in any case exit inner loop */ retval = ipfw_run_eaction(chain, args, cmd, &done); /* * If both @retval and @done are zero, * consider this as rule matching and * update counters. */ if (retval == 0 && done == 0) { IPFW_INC_RULE_COUNTER(f, pktlen); /* * Reset the result of the last * dynamic state lookup. * External action can change * @args content, and it may be * used for new state lookup later. */ DYN_INFO_INIT(&dyn_info); } break; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN #undef PULLUP_LEN_LOCKED if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_destroy_sopt_handler(); ipfw_destroy_obj_rewriter(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* Init shared services hash table */ ipfw_init_srv(chain); ipfw_init_counters(); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } IPFW_LOCK_INIT(chain); /* fill and insert the default rule */ rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); rule->flags |= IPFW_RULE_NOOPT; rule->cmd_len = 1; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = rule; ipfw_add_protected_rule(chain, rule, 0); ipfw_dyn_init(chain); ipfw_eaction_init(chain, first); #ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); #endif ipfw_bpf_init(first); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ ipfw_detach_hooks(); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_UH_WLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); #ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); #endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); ipfw_eaction_uninit(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); ipfw_bpf_uninit(last); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: head/sys/netpfil/ipfw/ip_fw_table_algo.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 362899) +++ head/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 362900) @@ -1,4113 +1,4110 @@ /*- * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Lookup table algorithms. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include +#include #include #include #include #include /* struct ipfw_rule_ref */ #include #include #include #include /* * IPFW table lookup algorithms. * * What is needed to add another table algo? * * Algo init: * * struct table_algo has to be filled with: * name: "type:algoname" format, e.g. "addr:radix". Currently * there are the following types: "addr", "iface", "number" and "flow". * type: one of IPFW_TABLE_* types * flags: one or more TA_FLAGS_* * ta_buf_size: size of structure used to store add/del item state. * Needs to be less than TA_BUF_SZ. * callbacks: see below for description. * * ipfw_add_table_algo / ipfw_del_table_algo has to be called * * Callbacks description: * * -init: request to initialize new table instance. * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, * struct table_info *ti, char *data, uint8_t tflags); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all structures needed for normal operations. * * Caller may want to parse @data for some algo-specific * options provided by userland. * * Caller may want to save configuration state pointer to @ta_state * * Caller needs to save desired runtime structure pointer(s) * inside @ti fields. Note that it is not correct to save * @ti pointer at this moment. Use -change_ti hook for that. * * Caller has to fill in ti->lookup to appropriate function * pointer. * * * * -destroy: request to destroy table instance. * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); * MANDATORY, unlocked. (M_WAITOK). * * Frees all table entries and all tables structures allocated by -init. * * * * -prepare_add: request to allocate state for adding new entry. * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocates state and fills it in with all necessary data (EXCEPT value) * from @tei to minimize operations needed to be done under WLOCK. * "value" field has to be copied to new entry in @add callback. * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. * * * * -prepare_del: request to set state for deleting existing entry. * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. * * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. Caller should use on-stack ta_buf allocation * instead of doing malloc(). * * * * -add: request to insert new entry into runtime/config structures. * typedef int (ta_add)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Insert new entry using previously-allocated state in @ta_buf. * * @tei may have the following flags: * TEI_FLAGS_UPDATE: request to add or update entry. * TEI_FLAGS_DONTADD: request to update (but not add) entry. * * Caller is required to do the following: * copy real entry value from @tei * entry added: return 0, set 1 to @pnum * entry updated: return 0, store 0 to @pnum, store old value in @tei, * add TEI_FLAGS_UPDATED flag to @tei. * entry exists: return EEXIST * entry not found: return ENOENT * other error: return non-zero error code. * * * * -del: request to delete existing entry from runtime/config structures. * typedef int (ta_del)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Delete entry using previously set up in @ta_buf. * * Caller is required to do the following: * entry deleted: return 0, set 1 to @pnum, store old value in @tei. * entry not found: return ENOENT * other error: return non-zero error code. * * * * -flush_entry: flush entry state created by -prepare_add / -del / others * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, * struct tentry_info *tei, void *ta_buf); * MANDATORY, may be locked. (M_NOWAIT). * * Delete state allocated by: * -prepare_add (-add returned EEXIST|UPDATED) * -prepare_del (if any) * -del * * Caller is required to handle empty @ta_buf correctly. * * * -find_tentry: finds entry specified by key @tei * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, * ipfw_obj_tentry *tent); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. * * Finds entry specified by given key. * * Caller is required to do the following: * entry found: returns 0, export entry to @tent * entry not found: returns ENOENT * * * -need_modify: checks if @ti has enough space to hold another @count items. * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, * uint32_t count, uint64_t *pflags); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has. * * Checks if given table has enough space to add @count items without * resize. Caller may use @pflags to store desired modification data. * * * * -prepare_mod: allocate structures for table modification. * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all needed state for table modification. Caller * should use `struct mod_item` to store new state in @ta_buf. * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. * * * * -fill_mod: copy some data to new state/ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. * * Copy as much data as we can to minimize changes under WLOCK. * For example, array can be merged inside this callback. * * * * -modify: perform final modification. * typedef void (ta_modify)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t pflags); * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). * * Performs all changes necessary to switch to new structures. * * Caller should save old pointers to @ta_buf storage. * * * * -flush_mod: flush table modification state. * typedef void (ta_flush_mod)(void *ta_buf); * OPTIONAL(need_modify), unlocked. (M_WAITOK). * * Performs flush for the following: * - prepare_mod (modification was not necessary) * - modify (for the old state) * * * * -change_gi: monitor table info pointer changes * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); * OPTIONAL, locked (UH). (M_NOWAIT). * * Called on @ti pointer changed. Called immediately after -init * to set initial state. * * * * -foreach: calls @f for each table entry * typedef void ta_foreach(void *ta_state, struct table_info *ti, * ta_foreach_f *f, void *arg); * MANDATORY, locked(UH). (M_NOWAIT). * * Runs callback with specified argument for each table entry, * Typically used for dumping table entries. * * * * -dump_tentry: dump table entry in current @tentry format. * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, * ipfw_obj_tentry *tent); * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. * * Dumps entry @e to @tent. * * * -print_config: prints custom algorithm options into buffer. * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, * char *buf, size_t bufsize); * OPTIONAL. locked(UH). (M_NOWAIT). * * Prints custom algorithm options in the format suitable to pass * back to -init callback. * * * * -dump_tinfo: dumps algo-specific info. * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, * ipfw_ta_tinfo *tinfo); * OPTIONAL. locked(UH). (M_NOWAIT). * * Dumps options like items size/hash size, etc. */ MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); /* * Utility structures/functions common to more than one algo */ struct mod_item { void *main_ptr; size_t size; void *main_ptr6; size_t size6; }; static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); /* * ADDR implementation using radix * */ /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. * But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) /* * Do not require radix to compare more than actual IPv4/IPv6 address */ #define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) #define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) #define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) #define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) struct radix_addr_entry { struct radix_node rn[2]; struct sockaddr_in addr; uint32_t value; uint8_t masklen; }; struct sa_in6 { uint8_t sin6_len; uint8_t sin6_family; uint8_t pad[2]; struct in6_addr sin6_addr; }; struct radix_addr_xentry { struct radix_node rn[2]; struct sa_in6 addr6; uint32_t value; uint8_t masklen; }; struct radix_cfg { struct radix_node_head *head4; struct radix_node_head *head6; size_t count4; size_t count6; }; struct ta_buf_radix { void *ent_ptr; struct sockaddr *addr_ptr; struct sockaddr *mask_ptr; union { struct { struct sockaddr_in sa; struct sockaddr_in ma; } a4; struct { struct sa_in6 sa; struct sa_in6 ma; } a6; } addr; }; static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static int flush_radix_entry(struct radix_node *rn, void *arg); static void ta_destroy_radix(void *ta_state, struct table_info *ti); static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask); static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct radix_node_head *rnh; if (keylen == sizeof(in_addr_t)) { struct radix_addr_entry *ent; struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = *((in_addr_t *)key); rnh = (struct radix_node_head *)ti->state; ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh)); if (ent != NULL) { *val = ent->value; return (1); } } else { struct radix_addr_xentry *xent; struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh)); if (xent != NULL) { *val = xent->value; return (1); } } return (0); } /* * New table */ static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct radix_cfg *cfg; if (!rn_inithead(&ti->state, OFF_LEN_INET)) return (ENOMEM); if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { rn_detachhead(&ti->state); return (ENOMEM); } cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->lookup = ta_lookup_radix; return (0); } static int flush_radix_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct radix_addr_entry *ent; ent = (struct radix_addr_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, &rnh->rh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static void ta_destroy_radix(void *ta_state, struct table_info *ti) { struct radix_cfg *cfg; struct radix_node_head *rnh; cfg = (struct radix_cfg *)ta_state; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->state); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->xstate); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct radix_cfg *cfg; cfg = (struct radix_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = cfg->count4; tinfo->itemsize4 = sizeof(struct radix_addr_entry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = cfg->count6; tinfo->itemsize6 = sizeof(struct radix_addr_xentry); } static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct radix_addr_entry *n; #ifdef INET6 struct radix_addr_xentry *xn; #endif n = (struct radix_addr_entry *)e; /* Guess IPv4/IPv6 radix by sockaddr family */ if (n->addr.sin_family == AF_INET) { tent->k.addr.s_addr = n->addr.sin_addr.s_addr; tent->masklen = n->masklen; tent->subtype = AF_INET; tent->v.kidx = n->value; #ifdef INET6 } else { xn = (struct radix_addr_xentry *)e; memcpy(&tent->k.addr6, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); tent->masklen = xn->masklen; tent->subtype = AF_INET6; tent->v.kidx = xn->value; #endif } return (0); } static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct radix_node_head *rnh; void *e; e = NULL; if (tent->subtype == AF_INET) { struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = tent->k.addr.s_addr; rnh = (struct radix_node_head *)ti->state; e = rnh->rnh_matchaddr(&sa, &rnh->rh); } else { struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; e = rnh->rnh_matchaddr(&sa6, &rnh->rh); } if (e != NULL) { ta_dump_radix_tentry(ta_state, ti, e, tent); return (0); } return (ENOENT); } static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct radix_node_head *rnh; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); } #ifdef INET6 static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask) { uint32_t *cp; for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) *cp++ = 0xFFFFFFFF; if (mask > 0) *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); } #endif static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask) { int mlen; #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sa_in6 *addr6, *mask6; #endif in_addr_t a4; mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET addr = (struct sockaddr_in *)sa; mask = (struct sockaddr_in *)ma; /* Set 'total' structure length */ KEY_LEN(*addr) = KEY_LEN_INET; KEY_LEN(*mask) = KEY_LEN_INET; addr->sin_family = AF_INET; mask->sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); a4 = *((in_addr_t *)tei->paddr); addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; if (mlen != 32) *set_mask = 1; else *set_mask = 0; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ addr6 = (struct sa_in6 *)sa; mask6 = (struct sa_in6 *)ma; /* Set 'total' structure length */ KEY_LEN(*addr6) = KEY_LEN_INET6; KEY_LEN(*mask6) = KEY_LEN_INET6; addr6->sin6_family = AF_INET6; ipv6_writemask(&mask6->sin6_addr, mlen); memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); if (mlen != 128) *set_mask = 1; else *set_mask = 0; #endif } } static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct radix_addr_entry *ent; #ifdef INET6 struct radix_addr_xentry *xent; #endif struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); ent->masklen = mlen; addr = (struct sockaddr *)&ent->addr; mask = (struct sockaddr *)&tb->addr.a4.ma; tb->ent_ptr = ent; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); xent->masklen = mlen; addr = (struct sockaddr *)&xent->addr6; mask = (struct sockaddr *)&tb->addr.a6.ma; tb->ent_ptr = xent; #endif } else { /* Unknown CIDR type */ return (EINVAL); } tei_to_sockaddr_ent(tei, addr, mask, &set_mask); /* Set pointers */ tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; uint32_t *old_value, value; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; /* Save current entry value from @tei */ if (tei->subtype == AF_INET) { rnh = ti->state; ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; } else { rnh = ti->xstate; ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; } /* Search for an entry first */ rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ if (tei->subtype == AF_INET) old_value = &((struct radix_addr_entry *)rn)->value; else old_value = &((struct radix_addr_xentry *)rn)->value; value = *old_value; *old_value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh,tb->ent_ptr); if (rn == NULL) { /* Unknown error */ return (EINVAL); } if (tei->subtype == AF_INET) cfg->count4++; else cfg->count6++; tb->ent_ptr = NULL; *pnum = 1; return (0); } static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { if (mlen > 32) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a4.sa; mask = (struct sockaddr *)&tb->addr.a4.ma; #ifdef INET6 } else if (tei->subtype == AF_INET6) { if (mlen > 128) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a6.sa; mask = (struct sockaddr *)&tb->addr.a6.ma; #endif } else return (EINVAL); tei_to_sockaddr_ent(tei, addr, mask, &set_mask); tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; if (tei->subtype == AF_INET) rnh = ti->state; else rnh = ti->xstate; rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn == NULL) return (ENOENT); /* Save entry value to @tei */ if (tei->subtype == AF_INET) tei->value = ((struct radix_addr_entry *)rn)->value; else tei->value = ((struct radix_addr_xentry *)rn)->value; tb->ent_ptr = rn; if (tei->subtype == AF_INET) cfg->count4--; else cfg->count6--; *pnum = 1; return (0); } static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; tb = (struct ta_buf_radix *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { /* * radix does not require additional memory allocations * other than nodes itself. Adding new masks to the tree do * but we don't have any API to call (and we don't known which * sizes do we need). */ return (0); } struct table_algo addr_radix = { .name = "addr:radix", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_radix), .init = ta_init_radix, .destroy = ta_destroy_radix, .prepare_add = ta_prepare_add_radix, .prepare_del = ta_prepare_del_radix, .add = ta_add_radix, .del = ta_del_radix, .flush_entry = ta_flush_radix_entry, .foreach = ta_foreach_radix, .dump_tentry = ta_dump_radix_tentry, .find_tentry = ta_find_radix_tentry, .dump_tinfo = ta_dump_radix_tinfo, .need_modify = ta_need_modify_radix, }; /* * addr:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [v4=1/v6=0][hsize] * [ 32][ 32] */ struct chashentry; SLIST_HEAD(chashbhead, chashentry); struct chash_cfg { struct chashbhead *head4; struct chashbhead *head6; size_t size4; size_t size6; size_t items4; size_t items6; uint8_t mask4; uint8_t mask6; }; struct chashentry { SLIST_ENTRY(chashentry) next; uint32_t value; uint32_t type; union { uint32_t a4; /* Host format */ struct in6_addr a6; /* Network format */ } a; }; struct ta_buf_chash { void *ent_ptr; struct chashentry ent; }; #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize); #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize); static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize); static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize); static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize); #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int chash_parse_opts(struct chash_cfg *cfg, char *data); static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_log2(uint32_t v); static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_chash(void *ta_state, struct table_info *ti); static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size); static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent); static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_chash(void *ta_buf); #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize) { return (addr % (hsize - 1)); } #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; return (i % (hsize - 1)); } static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; return (i % (hsize - 1)); } static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) { struct in6_addr mask6; ipv6_writemask(&mask6, mask); memcpy(addr6, key, sizeof(struct in6_addr)); APPLY_MASK(addr6, &mask6); return (hash_ip6(addr6, hsize)); } static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) { uint64_t *paddr; paddr = (uint64_t *)addr6; *paddr = 0; *(paddr + 1) = 0; memcpy(addr6, key, mask); return (hash_ip6(addr6, hsize)); } #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: worst scenario: non-round mask */ struct in6_addr addr6; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_slow(&addr6, key, imask, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (memcmp(&ent->a.a6, &addr6, 16) == 0) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: aligned to 8bit mask */ struct in6_addr addr6; uint64_t *paddr, *ptmp; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_al(&addr6, key, imask, hsize); paddr = (uint64_t *)&addr6; SLIST_FOREACH(ent, &head[hash], next) { ptmp = (uint64_t *)&ent->a.a6; if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: /64 */ uint64_t a6, *paddr; head = (struct chashbhead *)ti->xstate; paddr = (uint64_t *)key; hsize = 1 << (ti->data & 0xFF); a6 = *paddr; hash = hash_ip64((struct in6_addr *)key, hsize); SLIST_FOREACH(ent, &head[hash], next) { paddr = (uint64_t *)&ent->a.a6; if (a6 == *paddr) { *val = ent->value; return (1); } } #endif } return (0); } static int chash_parse_opts(struct chash_cfg *cfg, char *data) { char *pdel, *pend, *s; int mask4, mask6; mask4 = cfg->mask4; mask6 = cfg->mask6; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "masks=", 6) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 6; /* Need /XX[,/YY] */ if (*pdel++ != '/') return (EINVAL); mask4 = strtol(pdel, &pend, 10); if (*pend == ',') { /* ,/YY */ pdel = pend + 1; if (*pdel++ != '/') return (EINVAL); mask6 = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); } else if (*pend != '\0') return (EINVAL); if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) return (EINVAL); cfg->mask4 = mask4; cfg->mask6 = mask6; return (0); } static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; if (cfg->mask4 != 32 || cfg->mask6 != 128) snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", cfg->mask4, cfg->mask6); else snprintf(buf, bufsize, "%s", "addr:hash"); } static int ta_log2(uint32_t v) { uint32_t r; r = 0; while (v >>= 1) r++; return (r); } /* * New table. * We assume 'data' to be either NULL or the following format: * 'addr:hash [masks=/32[,/128]]' */ static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, i; uint32_t hsize; struct chash_cfg *cfg; cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->mask4 = 32; cfg->mask6 = 128; if ((error = chash_parse_opts(cfg, data)) != 0) { free(cfg, M_IPFW); return (error); } cfg->size4 = 128; cfg->size6 = 128; cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, M_WAITOK | M_ZERO); cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size4; i++) SLIST_INIT(&cfg->head4[i]); for (i = 0; i < cfg->size6; i++) SLIST_INIT(&cfg->head6[i]); *ta_state = cfg; ti->state = cfg->head4; ti->xstate = cfg->head6; /* Store data depending on v6 mask length */ hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); if (cfg->mask6 == 64) { ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| hsize; ti->lookup = ta_lookup_chash_64; } else if ((cfg->mask6 % 8) == 0) { ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 13 | hsize; ti->lookup = ta_lookup_chash_aligned; } else { /* don't do that! */ ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 16 | hsize; ti->lookup = ta_lookup_chash_slow; } return (0); } static void ta_destroy_chash(void *ta_state, struct table_info *ti) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) free(ent, M_IPFW_TBL); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head4, M_IPFW); free(cfg->head6, M_IPFW); free(cfg, M_IPFW); } static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size4; tinfo->count4 = cfg->items4; tinfo->itemsize4 = sizeof(struct chashentry); tinfo->taclass6 = IPFW_TACLASS_HASH; tinfo->size6 = cfg->size6; tinfo->count6 = cfg->items6; tinfo->itemsize6 = sizeof(struct chashentry); } static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashentry *ent; cfg = (struct chash_cfg *)ta_state; ent = (struct chashentry *)e; if (ent->type == AF_INET) { tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); tent->masklen = cfg->mask4; tent->subtype = AF_INET; tent->v.kidx = ent->value; #ifdef INET6 } else { memcpy(&tent->k.addr6, &ent->a.a6, sizeof(struct in6_addr)); tent->masklen = cfg->mask6; tent->subtype = AF_INET6; tent->v.kidx = ent->value; #endif } return (0); } static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size) { uint32_t hash; hash = 0; if (af == AF_INET) { #ifdef INET hash = hash_ip(ent->a.a4, size); #endif } else { #ifdef INET6 if (mlen == 64) hash = hash_ip64(&ent->a.a6, size); else hash = hash_ip6(&ent->a.a6, size); #endif } return (hash); } static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent) { int mlen; #ifdef INET6 struct in6_addr mask6; #endif mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent->type = AF_INET; /* Calculate masked address */ ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); ent->type = AF_INET6; ipv6_writemask(&mask6, mlen); memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&ent->a.a6, &mask6); #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry ent, *tmp; struct tentry_info tei; int error; uint32_t hash; cfg = (struct chash_cfg *)ta_state; memset(&ent, 0, sizeof(ent)); memset(&tei, 0, sizeof(tei)); if (tent->subtype == AF_INET) { tei.paddr = &tent->k.addr; tei.masklen = cfg->mask4; tei.subtype = AF_INET; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head4; hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 != ent.a.a4) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } else { tei.paddr = &tent->k.addr6; tei.masklen = cfg->mask6; tei.subtype = AF_INET6; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head6; hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) f(ent, arg); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; struct chashentry *ent; int error; tb = (struct ta_buf_chash *)ta_buf; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_chash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *ent, *tmp; struct ta_buf_chash *tb; int exists; uint32_t hash, value; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = (struct chashentry *)tb->ent_ptr; hash = 0; exists = 0; /* Read current value from @tei */ ent->value = tei->value; /* Read cuurrent value */ if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 == ent->a.a4) { exists = 1; break; } } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { exists = 1; break; } } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters */ if (tei->subtype == AF_INET) cfg->items4++; else cfg->items6++; } return (0); } static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; return (tei_to_chash_ent(tei, &tb->ent)); } static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *tmp, *tmp_next, *ent; struct ta_buf_chash *tb; uint32_t hash; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = &tb->ent; if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (tmp->a.a4 != ent->a.a4) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items4--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items6--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } return (ENOENT); } static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct chash_cfg *cfg; uint64_t data; /* * Since we don't know exact number of IPv4/IPv6 records in @count, * ignore non-zero @count value at all. Check current hash sizes * and return appropriate data. */ cfg = (struct chash_cfg *)ta_state; data = 0; if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) data |= (cfg->size4 * 2) << 16; if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) data |= cfg->size6 * 2; if (data != 0) { *pflags = data; return (1); } return (0); } /* * Allocate new, larger chash. */ static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct chashbhead *head; int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = (*pflags >> 16) & 0xFFFF; mi->size6 = *pflags & 0xFFFF; if (mi->size > 0) { head = malloc(sizeof(struct chashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; } if (mi->size6 > 0) { head = malloc(sizeof(struct chashbhead) * mi->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size6; i++) SLIST_INIT(&head[i]); mi->main_ptr6 = head; } return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct chash_cfg *cfg; struct chashbhead *old_head, *new_head; struct chashentry *ent, *ent_next; int af, i, mlen; uint32_t nhash; size_t old_size, new_size; mi = (struct mod_item *)ta_buf; cfg = (struct chash_cfg *)ta_state; /* Check which hash we need to grow and do we still need that */ if (mi->size > 0 && cfg->size4 < mi->size) { new_head = (struct chashbhead *)mi->main_ptr; new_size = mi->size; old_size = cfg->size4; old_head = ti->state; mlen = cfg->mask4; af = AF_INET; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; cfg->head4 = new_head; cfg->size4 = mi->size; mi->main_ptr = old_head; } if (mi->size6 > 0 && cfg->size6 < mi->size6) { new_head = (struct chashbhead *)mi->main_ptr6; new_size = mi->size6; old_size = cfg->size6; old_head = ti->xstate; mlen = cfg->mask6; af = AF_INET6; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->xstate = new_head; cfg->head6 = new_head; cfg->size6 = mi->size6; mi->main_ptr6 = old_head; } /* Update lower 32 bits with new values */ ti->data &= 0xFFFFFFFF00000000; ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); } /* * Free unneded array. */ static void ta_flush_mod_chash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); if (mi->main_ptr6 != NULL) free(mi->main_ptr6, M_IPFW); } struct table_algo addr_hash = { .name = "addr:hash", .type = IPFW_TABLE_ADDR, .ta_buf_size = sizeof(struct ta_buf_chash), .init = ta_init_chash, .destroy = ta_destroy_chash, .prepare_add = ta_prepare_add_chash, .prepare_del = ta_prepare_del_chash, .add = ta_add_chash, .del = ta_del_chash, .flush_entry = ta_flush_chash_entry, .foreach = ta_foreach_chash, .dump_tentry = ta_dump_chash_tentry, .find_tentry = ta_find_chash_tentry, .print_config = ta_print_chash_config, .dump_tinfo = ta_dump_chash_tinfo, .need_modify = ta_need_modify_chash, .prepare_mod = ta_prepare_mod_chash, .fill_mod = ta_fill_mod_chash, .modify = ta_modify_chash, .flush_mod = ta_flush_mod_chash, }; /* * Iface table cmds. * * Implementation: * * Runtime part: * - sorted array of "struct ifidx" pointed by ti->state. * Array is allocated with rounding up to IFIDX_CHUNK. Only existing * interfaces are stored in array, however its allocated size is * sufficient to hold all table records if needed. * - current array size is stored in ti->data * * Table data: * - "struct iftable_cfg" is allocated to store table state (ta_state). * - All table records are stored inside namedobj instance. * */ struct ifidx { uint16_t kidx; uint16_t spare; uint32_t value; }; #define DEFAULT_IFIDX_SIZE 64 struct iftable_cfg; struct ifentry { struct named_object no; struct ipfw_ifc ic; struct iftable_cfg *icfg; uint32_t value; int linked; }; struct iftable_cfg { struct namedobj_instance *ii; struct ip_fw_chain *ch; struct table_info *ti; void *main_ptr; size_t size; /* Number of items allocated in array */ size_t count; /* Number of all items */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_ifidx { struct ifentry *ife; uint32_t value; }; int compare_ifidx(const void *k, const void *v); static struct ifidx * ifidx_find(struct table_info *ti, void *key); static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); static int destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_ifidx(void *ta_buf); static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_ifidx(const void *k, const void *v) { const struct ifidx *ifidx; uint16_t key; key = *((const uint16_t *)k); ifidx = (const struct ifidx *)v; if (key < ifidx->kidx) return (-1); else if (key > ifidx->kidx) return (1); return (0); } /* * Adds item @item with key @key into ascending-sorted array @base. * Assumes @base has enough additional storage. * * Returns 1 on success, 0 on duplicate key. */ static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { int min, max, mid, shift, res; caddr_t paddr; if (nmemb == 0) { memcpy(base, item, size); return (1); } /* Binary search */ min = 0; max = nmemb - 1; mid = 0; while (min <= max) { mid = (min + max) / 2; res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res == 0) return (0); if (res > 0) min = mid + 1; else max = mid - 1; } /* Item not found. */ res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res > 0) shift = mid + 1; else shift = mid; paddr = (caddr_t)base + shift * size; if (nmemb > shift) memmove(paddr + size, paddr, (nmemb - shift) * size); memcpy(paddr, item, size); return (1); } /* * Deletes item with key @key from ascending-sorted array @base. * * Returns 1 on success, 0 for non-existent key. */ static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { caddr_t item; size_t sz; item = (caddr_t)bsearch(key, base, nmemb, size, compar); if (item == NULL) return (0); sz = (caddr_t)base + nmemb * size - item; if (sz > 0) memmove(item, item + size, sz); return (1); } static struct ifidx * ifidx_find(struct table_info *ti, void *key) { struct ifidx *ifi; ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), compare_ifidx); return (ifi); } static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct ifidx *ifi; ifi = ifidx_find(ti, key); if (ifi != NULL) { *val = ifi->value; return (1); } return (0); } static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct iftable_cfg *icfg; icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); icfg->size = DEFAULT_IFIDX_SIZE; icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, M_WAITOK | M_ZERO); icfg->ch = ch; *ta_state = icfg; ti->state = icfg->main_ptr; ti->lookup = ta_lookup_ifidx; return (0); } /* * Handle tableinfo @ti pointer change (on table array resize). */ static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; icfg = (struct iftable_cfg *)ta_state; icfg->ti = ti; } static int destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; ife = (struct ifentry *)no; ipfw_iface_del_notify(ch, &ife->ic); ipfw_iface_unref(ch, &ife->ic); free(ife, M_IPFW_TBL); return (0); } /* * Destroys table @ti */ static void ta_destroy_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; struct ip_fw_chain *ch; icfg = (struct iftable_cfg *)ta_state; ch = icfg->ch; if (icfg->main_ptr != NULL) free(icfg->main_ptr, M_IPFW); IPFW_UH_WLOCK(ch); ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); IPFW_UH_WUNLOCK(ch); ipfw_objhash_destroy(icfg->ii); free(icfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct iftable_cfg *cfg; cfg = (struct iftable_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct ifidx); } /* * Prepare state to add to the table: * allocate ifentry and reference needed interface. */ static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; struct ifentry *ife; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); ife->ic.cb = if_notifier; ife->ic.cbdata = ife; if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { free(ife, M_IPFW_TBL); return (EINVAL); } /* Use ipfw_iface 'ifname' field as stable storage */ ife->no.name = ife->ic.iface->ifname; tb->ife = ife; return (0); } static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife, *tmp; struct ta_buf_ifidx *tb; struct ipfw_iface *iif; struct ifidx *ifi; char *ifname; uint32_t value; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife->icfg = icfg; ife->value = tei->value; tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (tmp != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values in @tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; iif = tmp->ic.iface; if (iif->resolved != 0) { /* We have to update runtime value, too */ ifi = ifidx_find(ti, &iif->ifindex); ifi->value = ife->value; } /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); /* Link to internal list */ ipfw_objhash_add(icfg->ii, &ife->no); /* Link notifier (possible running its callback) */ ipfw_iface_add_notify(icfg->ch, &ife->ic); icfg->count++; tb->ife = NULL; *pnum = 1; return (0); } /* * Prepare to delete key from table. * Do basic interface name checks. */ static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife; struct ta_buf_ifidx *tb; char *ifname; uint16_t ifindex; int res; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife == NULL) return (ENOENT); if (ife->linked != 0) { /* We have to remove item from runtime */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } /* Unlink from local list */ ipfw_objhash_del(icfg->ii, &ife->no); /* Unlink notifier and deref */ ipfw_iface_del_notify(icfg->ch, &ife->ic); ipfw_iface_unref(icfg->ch, &ife->ic); icfg->count--; tei->value = ife->value; tb->ife = ife; *pnum = 1; return (0); } /* * Flush deleted entry. * Drops interface reference and frees entry. */ static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; tb = (struct ta_buf_ifidx *)ta_buf; if (tb->ife != NULL) free(tb->ife, M_IPFW_TBL); } /* * Handle interface announce/withdrawal for particular table. * Every real runtime array modification happens here. */ static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) { struct ifentry *ife; struct ifidx ifi; struct iftable_cfg *icfg; struct table_info *ti; int res; ife = (struct ifentry *)cbdata; icfg = ife->icfg; ti = icfg->ti; KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); if (ife->linked == 0 && ifindex != 0) { /* Interface announce */ ifi.kidx = ifindex; ifi.spare = 0; ifi.value = ife->value; res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d already exists", ifindex)); icfg->used++; ti->data = icfg->used; ife->linked = 1; } else if (ife->linked != 0 && ifindex == 0) { /* Interface withdrawal */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } } /* * Table growing callbacks. */ static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct iftable_cfg *cfg; uint32_t size; cfg = (struct iftable_cfg *)ta_state; size = cfg->size; while (size < cfg->count + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate ned, larger runtime ifidx array. */ static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct iftable_cfg *icfg; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; /* Check if we still need to grow array */ if (icfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct iftable_cfg *icfg; void *old_ptr; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; old_ptr = icfg->main_ptr; icfg->main_ptr = mi->main_ptr; icfg->size = mi->size; ti->state = icfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_ifidx(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct ifentry *ife; ife = (struct ifentry *)e; tent->masklen = 8 * IF_NAMESIZE; memcpy(&tent->k, ife->no.name, IF_NAMESIZE); tent->v.kidx = ife->value; return (0); } static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct iftable_cfg *icfg; struct ifentry *ife; char *ifname; icfg = (struct iftable_cfg *)ta_state; ifname = tent->k.iface; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife != NULL) { ta_dump_ifidx_tentry(ta_state, ti, ife, tent); return (0); } return (ENOENT); } struct wa_ifidx { ta_foreach_f *f; void *arg; }; static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct wa_ifidx *wa; ife = (struct ifentry *)no; wa = (struct wa_ifidx *)arg; wa->f(ife, wa->arg); return (0); } static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct iftable_cfg *icfg; struct wa_ifidx wa; icfg = (struct iftable_cfg *)ta_state; wa.f = f; wa.arg = arg; ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); } struct table_algo iface_idx = { .name = "iface:array", .type = IPFW_TABLE_INTERFACE, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_ifidx), .init = ta_init_ifidx, .destroy = ta_destroy_ifidx, .prepare_add = ta_prepare_add_ifidx, .prepare_del = ta_prepare_del_ifidx, .add = ta_add_ifidx, .del = ta_del_ifidx, .flush_entry = ta_flush_ifidx_entry, .foreach = ta_foreach_ifidx, .dump_tentry = ta_dump_ifidx_tentry, .find_tentry = ta_find_ifidx_tentry, .dump_tinfo = ta_dump_ifidx_tinfo, .need_modify = ta_need_modify_ifidx, .prepare_mod = ta_prepare_mod_ifidx, .fill_mod = ta_fill_mod_ifidx, .modify = ta_modify_ifidx, .flush_mod = ta_flush_mod_ifidx, .change_ti = ta_change_ti_ifidx, }; /* * Number array cmds. * * Implementation: * * Runtime part: * - sorted array of "struct numarray" pointed by ti->state. * Array is allocated with rounding up to NUMARRAY_CHUNK. * - current array size is stored in ti->data * */ struct numarray { uint32_t number; uint32_t value; }; struct numarray_cfg { void *main_ptr; size_t size; /* Number of items allocated in array */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_numarray { struct numarray na; }; int compare_numarray(const void *k, const void *v); static struct numarray *numarray_find(struct table_info *ti, void *key); static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_numarray(void *ta_state, struct table_info *ti); static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_numarray(void *ta_buf); static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_numarray(const void *k, const void *v) { const struct numarray *na; uint32_t key; key = *((const uint32_t *)k); na = (const struct numarray *)v; if (key < na->number) return (-1); else if (key > na->number) return (1); return (0); } static struct numarray * numarray_find(struct table_info *ti, void *key) { struct numarray *ri; ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), compare_ifidx); return (ri); } static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct numarray *ri; ri = numarray_find(ti, key); if (ri != NULL) { *val = ri->value; return (1); } return (0); } static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct numarray_cfg *cfg; cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 16; cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->state = cfg->main_ptr; ti->lookup = ta_lookup_numarray; return (0); } /* * Destroys table @ti */ static void ta_destroy_numarray(void *ta_state, struct table_info *ti) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; if (cfg->main_ptr != NULL) free(cfg->main_ptr, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct numarray); } /* * Prepare for addition/deletion to an array. */ static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_numarray *tb; tb = (struct ta_buf_numarray *)ta_buf; tb->na.number = *((uint32_t *)tei->paddr); return (0); } static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; uint32_t value; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Read current value from @tei */ tb->na.value = tei->value; ri = numarray_find(ti, &tb->na.number); if (ri != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values between ri and @tei */ value = ri->value; ri->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %d already exists", tb->na.number)); cfg->used++; ti->data = cfg->used; *pnum = 1; return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tb->na.number); if (ri == NULL) return (ENOENT); tei->value = ri->value; res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %u does not exist", tb->na.number)); cfg->used--; ti->data = cfg->used; *pnum = 1; return (0); } static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { /* We don't have any state, do nothing */ } /* * Table growing callbacks. */ static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct numarray_cfg *cfg; size_t size; cfg = (struct numarray_cfg *)ta_state; size = cfg->size; while (size < cfg->used + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate new, larger runtime array. */ static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct numarray_cfg *cfg; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Check if we still need to grow array */ if (cfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct numarray_cfg *cfg; void *old_ptr; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; old_ptr = cfg->main_ptr; cfg->main_ptr = mi->main_ptr; cfg->size = mi->size; ti->state = cfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_numarray(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct numarray *na; na = (struct numarray *)e; tent->k.key = na->number; tent->v.kidx = na->value; return (0); } static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct numarray_cfg *cfg; struct numarray *ri; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tent->k.key); if (ri != NULL) { ta_dump_numarray_tentry(ta_state, ti, ri, tent); return (0); } return (ENOENT); } static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct numarray_cfg *cfg; struct numarray *array; int i; cfg = (struct numarray_cfg *)ta_state; array = cfg->main_ptr; for (i = 0; i < cfg->used; i++) f(&array[i], arg); } struct table_algo number_array = { .name = "number:array", .type = IPFW_TABLE_NUMBER, .ta_buf_size = sizeof(struct ta_buf_numarray), .init = ta_init_numarray, .destroy = ta_destroy_numarray, .prepare_add = ta_prepare_add_numarray, .prepare_del = ta_prepare_add_numarray, .add = ta_add_numarray, .del = ta_del_numarray, .flush_entry = ta_flush_numarray_entry, .foreach = ta_foreach_numarray, .dump_tentry = ta_dump_numarray_tentry, .find_tentry = ta_find_numarray_tentry, .dump_tinfo = ta_dump_numarray_tinfo, .need_modify = ta_need_modify_numarray, .prepare_mod = ta_prepare_mod_numarray, .fill_mod = ta_fill_mod_numarray, .modify = ta_modify_numarray, .flush_mod = ta_flush_mod_numarray, }; /* * flow:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [hsize4][hsize6] * [ 16][ 16] */ struct fhashentry; SLIST_HEAD(fhashbhead, fhashentry); struct fhashentry { SLIST_ENTRY(fhashentry) next; uint8_t af; uint8_t proto; uint16_t spare0; uint16_t dport; uint16_t sport; uint32_t value; uint32_t spare1; }; struct fhashentry4 { struct fhashentry e; struct in_addr dip; struct in_addr sip; }; struct fhashentry6 { struct fhashentry e; struct in6_addr dip6; struct in6_addr sip6; }; struct fhash_cfg { struct fhashbhead *head; size_t size; size_t items; struct fhashentry4 fe4; struct fhashentry6 fe6; }; struct ta_buf_fhash { void *ent_ptr; struct fhashentry6 fe6; }; static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz); static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_fhash(void *ta_state, struct table_info *ti); static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_fhash(void *ta_buf); static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) { uint64_t *ka, *kb; ka = (uint64_t *)(&a->next + 1); kb = (uint64_t *)(&b->next + 1); if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) return (1); return (0); } static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize) { uint32_t i; i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize) { uint32_t i; i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ (f->dip6.__u6_addr.__u6_addr32[3]) ^ (f->sip6.__u6_addr.__u6_addr32[2]) ^ (f->sip6.__u6_addr.__u6_addr32[3]) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size) { uint32_t hash; if (ent->af == AF_INET) { hash = hash_flow4((struct fhashentry4 *)ent, size); } else { hash = hash_flow6((struct fhashentry6 *)ent, size); } return (hash); } static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct fhashbhead *head; struct fhashentry *ent; struct fhashentry4 *m4; struct ipfw_flow_id *id; uint32_t hsize; uint16_t hash; id = (struct ipfw_flow_id *)key; head = (struct fhashbhead *)ti->state; hsize = ti->data; m4 = (struct fhashentry4 *)ti->xstate; if (id->addr_type == 4) { struct fhashentry4 f; /* Copy hash mask */ f = *m4; f.dip.s_addr &= id->dst_ip; f.sip.s_addr &= id->src_ip; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow4(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { *val = ent->value; return (1); } } } else if (id->addr_type == 6) { struct fhashentry6 f; uint64_t *fp, *idp; /* Copy hash mask */ f = *((struct fhashentry6 *)(m4 + 1)); /* Handle lack of __u6_addr.__u6_addr64 */ fp = (uint64_t *)&f.dip6; idp = (uint64_t *)&id->dst_ip6; /* src IPv6 is stored after dst IPv6 */ *fp++ &= *idp++; *fp++ &= *idp++; *fp++ &= *idp++; *fp &= *idp; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow6(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { *val = ent->value; return (1); } } } return (0); } /* * New table. */ static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct fhash_cfg *cfg; struct fhashentry4 *fe4; struct fhashentry6 *fe6; u_int i; cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 512; cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size; i++) SLIST_INIT(&cfg->head[i]); /* Fill in fe masks based on @tflags */ fe4 = &cfg->fe4; fe6 = &cfg->fe6; if (tflags & IPFW_TFFLAG_SRCIP) { memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); } if (tflags & IPFW_TFFLAG_DSTIP) { memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); } if (tflags & IPFW_TFFLAG_SRCPORT) { memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); } if (tflags & IPFW_TFFLAG_DSTPORT) { memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); } if (tflags & IPFW_TFFLAG_PROTO) { memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); } fe4->e.af = AF_INET; fe6->e.af = AF_INET6; *ta_state = cfg; ti->state = cfg->head; ti->xstate = &cfg->fe4; ti->data = cfg->size; ti->lookup = ta_lookup_fhash; return (0); } static void ta_destroy_fhash(void *ta_state, struct table_info *ti) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size; tinfo->count4 = cfg->items; tinfo->itemsize4 = sizeof(struct fhashentry4); tinfo->itemsize6 = sizeof(struct fhashentry6); } static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashentry *ent; struct fhashentry4 *fe4; #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; cfg = (struct fhash_cfg *)ta_state; ent = (struct fhashentry *)e; tfe = &tent->k.flow; tfe->af = ent->af; tfe->proto = ent->proto; tfe->dport = htons(ent->dport); tfe->sport = htons(ent->sport); tent->v.kidx = ent->value; tent->subtype = ent->af; if (ent->af == AF_INET) { fe4 = (struct fhashentry4 *)ent; tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); tent->masklen = 32; #ifdef INET6 } else { fe6 = (struct fhashentry6 *)ent; tfe->a.a6.sip6 = fe6->sip6; tfe->a.a6.dip6 = fe6->dip6; tent->masklen = 128; #endif } return (0); } static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) { #ifdef INET struct fhashentry4 *fe4; #endif #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; tfe = (struct tflow_entry *)tei->paddr; ent->af = tei->subtype; ent->proto = tfe->proto; ent->dport = ntohs(tfe->dport); ent->sport = ntohs(tfe->sport); if (tei->subtype == AF_INET) { #ifdef INET fe4 = (struct fhashentry4 *)ent; fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { fe6 = (struct fhashentry6 *)ent; fe6->sip6 = tfe->a.a6.sip6; fe6->dip6 = tfe->a.a6.dip6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct fhashentry6 fe6; struct tentry_info tei; int error; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; ent = &fe6.e; memset(&fe6, 0, sizeof(fe6)); memset(&tei, 0, sizeof(tei)); tei.paddr = &tent->k.flow; tei.subtype = tent->subtype; if ((error = tei_to_fhash_ent(&tei, ent)) != 0) return (error); head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei.subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { ta_dump_fhash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; struct fhashentry *ent; size_t sz; int error; tb = (struct ta_buf_fhash *)ta_buf; if (tei->subtype == AF_INET) sz = sizeof(struct fhashentry4); else if (tei->subtype == AF_INET6) sz = sizeof(struct fhashentry6); else return (EINVAL); ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_fhash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; int exists; uint32_t hash, value; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = (struct fhashentry *)tb->ent_ptr; exists = 0; /* Read current value from @tei */ ent->value = tei->value; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { exists = 1; break; } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ /* Exchange values between tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters and check if we need to grow hash */ cfg->items++; } return (0); } static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; return (tei_to_fhash_ent(tei, &tb->fe6.e)); } static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = &tb->fe6.e; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) == 0) continue; SLIST_REMOVE(&head[hash], tmp, fhashentry, next); tei->value = tmp->value; *pnum = 1; cfg->items--; tb->ent_ptr = tmp; return (0); } return (ENOENT); } static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; if (cfg->items > cfg->size && cfg->size < 65536) { *pflags = cfg->size * 2; return (1); } return (0); } /* * Allocate new, larger fhash. */ static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct fhashbhead *head; u_int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct fhash_cfg *cfg; struct fhashbhead *old_head, *new_head; struct fhashentry *ent, *ent_next; int i; uint32_t nhash; size_t old_size; mi = (struct mod_item *)ta_buf; cfg = (struct fhash_cfg *)ta_state; old_size = cfg->size; old_head = ti->state; new_head = (struct fhashbhead *)mi->main_ptr; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_flow_ent(ent, mi->size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; ti->data = mi->size; cfg->head = new_head; cfg->size = mi->size; mi->main_ptr = old_head; } /* * Free unneded array. */ static void ta_flush_mod_fhash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } struct table_algo flow_hash = { .name = "flow:hash", .type = IPFW_TABLE_FLOW, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_fhash), .init = ta_init_fhash, .destroy = ta_destroy_fhash, .prepare_add = ta_prepare_add_fhash, .prepare_del = ta_prepare_del_fhash, .add = ta_add_fhash, .del = ta_del_fhash, .flush_entry = ta_flush_fhash_entry, .foreach = ta_foreach_fhash, .dump_tentry = ta_dump_fhash_tentry, .find_tentry = ta_find_fhash_tentry, .dump_tinfo = ta_dump_fhash_tinfo, .need_modify = ta_need_modify_fhash, .prepare_mod = ta_prepare_mod_fhash, .fill_mod = ta_fill_mod_fhash, .modify = ta_modify_fhash, .flush_mod = ta_flush_mod_fhash, }; /* * Kernel fibs bindings. * * Implementation: * * Runtime part: * - fully relies on route API * - fib number is stored in ti->data * */ static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int kfib_parse_opts(int *pfib, char *data); static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_kfib(void *ta_state, struct table_info *ti); static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int contigmask(uint8_t *p, int len); static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_dump_kfib_tentry_int(struct sockaddr *paddr, struct sockaddr *pmask, ipfw_obj_tentry *tent); static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { #ifdef INET - struct nhop4_basic nh4; struct in_addr in; #endif -#ifdef INET6 - struct nhop6_basic nh6; -#endif int error; error = ENOENT; #ifdef INET if (keylen == 4) { in.s_addr = *(in_addr_t *)key; - error = fib4_lookup_nh_basic(ti->data, - in, 0, 0, &nh4); + NET_EPOCH_ASSERT(); + error = fib4_lookup(ti->data, in, 0, NHR_NONE, 0) != NULL; } #endif #ifdef INET6 if (keylen == 6) - error = fib6_lookup_nh_basic(ti->data, - (struct in6_addr *)key, 0, 0, 0, &nh6); + error = fib6_lookup(ti->data, (struct in6_addr *)key, + 0, NHR_NONE, 0) != NULL; #endif if (error != 0) return (0); *val = 0; return (1); } /* Parse 'fib=%d' */ static int kfib_parse_opts(int *pfib, char *data) { char *pdel, *pend, *s; int fibnum; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "fib=", 4) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 4; /* Need \d+ */ fibnum = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); *pfib = fibnum; return (0); } static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { if (ti->data != 0) snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); else snprintf(buf, bufsize, "%s", "addr:kfib"); } static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, fibnum; fibnum = 0; if ((error = kfib_parse_opts(&fibnum, data)) != 0) return (error); if (fibnum >= rt_numfibs) return (E2BIG); ti->data = fibnum; ti->lookup = ta_lookup_kfib; return (0); } /* * Destroys table @ti */ static void ta_destroy_kfib(void *ta_state, struct table_info *ti) { } /* * Provide algo-specific table info */ static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { tinfo->flags = IPFW_TATFLAGS_AFDATA; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = 0; tinfo->itemsize4 = sizeof(struct rtentry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = 0; tinfo->itemsize6 = sizeof(struct rtentry); } static int contigmask(uint8_t *p, int len) { int i, n; for (i = 0; i < len ; i++) if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ break; for (n= i + 1; n < len; n++) if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) return (-1); /* mask not contiguous */ return (i); } static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct rtentry *rte; rte = (struct rtentry *)e; return ta_dump_kfib_tentry_int(rt_key(rte), rt_mask(rte), tent); } static int ta_dump_kfib_tentry_int(struct sockaddr *paddr, struct sockaddr *pmask, ipfw_obj_tentry *tent) { #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sockaddr_in6 *addr6, *mask6; #endif int len; len = 0; /* Guess IPv4/IPv6 radix by sockaddr family */ #ifdef INET if (paddr->sa_family == AF_INET) { addr = (struct sockaddr_in *)paddr; mask = (struct sockaddr_in *)pmask; tent->k.addr.s_addr = addr->sin_addr.s_addr; len = 32; if (mask != NULL) len = contigmask((uint8_t *)&mask->sin_addr, 32); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET; tent->v.kidx = 0; /* Do we need to put GW here? */ } #endif #ifdef INET6 if (paddr->sa_family == AF_INET6) { addr6 = (struct sockaddr_in6 *)paddr; mask6 = (struct sockaddr_in6 *)pmask; memcpy(&tent->k.addr6, &addr6->sin6_addr, sizeof(struct in6_addr)); len = 128; if (mask6 != NULL) len = contigmask((uint8_t *)&mask6->sin6_addr, 128); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET6; tent->v.kidx = 0; } #endif return (0); } static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct rt_addrinfo info; struct sockaddr_in6 key6, dst6, mask6; struct sockaddr *dst, *key, *mask; /* Prepare sockaddr for prefix/mask and info */ bzero(&dst6, sizeof(dst6)); dst6.sin6_len = sizeof(dst6); dst = (struct sockaddr *)&dst6; bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask = (struct sockaddr *)&mask6; bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_NETMASK] = mask; /* Prepare the lookup key */ bzero(&key6, sizeof(key6)); key6.sin6_family = tent->subtype; key = (struct sockaddr *)&key6; if (tent->subtype == AF_INET) { ((struct sockaddr_in *)&key6)->sin_addr = tent->k.addr; key6.sin6_len = sizeof(struct sockaddr_in); } else { key6.sin6_addr = tent->k.addr6; key6.sin6_len = sizeof(struct sockaddr_in6); } if (rib_lookup_info(ti->data, key, 0, 0, &info) != 0) return (ENOENT); if ((info.rti_addrs & RTA_NETMASK) == 0) mask = NULL; ta_dump_kfib_tentry_int(dst, mask, tent); return (0); } static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { RIB_RLOCK_TRACKER; struct rib_head *rh; int error; rh = rt_tables_get_rnh(ti->data, AF_INET); if (rh != NULL) { RIB_RLOCK(rh); error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); RIB_RUNLOCK(rh); } rh = rt_tables_get_rnh(ti->data, AF_INET6); if (rh != NULL) { RIB_RLOCK(rh); error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg); RIB_RUNLOCK(rh); } } struct table_algo addr_kfib = { .name = "addr:kfib", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_READONLY, .ta_buf_size = 0, .init = ta_init_kfib, .destroy = ta_destroy_kfib, .foreach = ta_foreach_kfib, .dump_tentry = ta_dump_kfib_tentry, .find_tentry = ta_find_kfib_tentry, .dump_tinfo = ta_dump_kfib_tinfo, .print_config = ta_print_kfib_config, }; void ipfw_table_algo_init(struct ip_fw_chain *ch) { size_t sz; /* * Register all algorithms presented here. */ sz = sizeof(struct table_algo); ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); } void ipfw_table_algo_destroy(struct ip_fw_chain *ch) { ipfw_del_table_algo(ch, addr_radix.idx); ipfw_del_table_algo(ch, addr_hash.idx); ipfw_del_table_algo(ch, iface_idx.idx); ipfw_del_table_algo(ch, number_array.idx); ipfw_del_table_algo(ch, flow_hash.idx); ipfw_del_table_algo(ch, addr_kfib.idx); } Index: head/sys/netpfil/ipfw/nat64/nat64_translate.c =================================================================== --- head/sys/netpfil/ipfw/nat64/nat64_translate.c (revision 362899) +++ head/sys/netpfil/ipfw/nat64/nat64_translate.c (revision 362900) @@ -1,1707 +1,1716 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ip_fw_nat64.h" #include "nat64_translate.h" typedef int (*nat64_output_t)(struct ifnet *, struct mbuf *, struct sockaddr *, struct nat64_counters *, void *); typedef int (*nat64_output_one_t)(struct mbuf *, struct nat64_counters *, void *); -static int nat64_find_route4(struct nhop4_basic *, struct sockaddr_in *, +static struct nhop_object *nat64_find_route4(struct sockaddr_in *, struct mbuf *); -static int nat64_find_route6(struct nhop6_basic *, struct sockaddr_in6 *, +static struct nhop_object *nat64_find_route6(struct sockaddr_in6 *, struct mbuf *); static int nat64_output_one(struct mbuf *, struct nat64_counters *, void *); static int nat64_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct nat64_counters *, void *); static int nat64_direct_output_one(struct mbuf *, struct nat64_counters *, void *); static int nat64_direct_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct nat64_counters *, void *); struct nat64_methods { nat64_output_t output; nat64_output_one_t output_one; }; static const struct nat64_methods nat64_netisr = { .output = nat64_output, .output_one = nat64_output_one }; static const struct nat64_methods nat64_direct = { .output = nat64_direct_output, .output_one = nat64_direct_output_one }; /* These variables should be initialized explicitly on module loading */ VNET_DEFINE_STATIC(const struct nat64_methods *, nat64out); VNET_DEFINE_STATIC(const int *, nat64ipstealth); VNET_DEFINE_STATIC(const int *, nat64ip6stealth); #define V_nat64out VNET(nat64out) #define V_nat64ipstealth VNET(nat64ipstealth) #define V_nat64ip6stealth VNET(nat64ip6stealth) static const int stealth_on = 1; #ifndef IPSTEALTH static const int stealth_off = 0; #endif void nat64_set_output_method(int direct) { if (direct != 0) { V_nat64out = &nat64_direct; #ifdef IPSTEALTH /* Honor corresponding variables, if IPSTEALTH is defined */ V_nat64ipstealth = &V_ipstealth; V_nat64ip6stealth = &V_ip6stealth; #else /* otherwise we need to decrement HLIM/TTL for direct case */ V_nat64ipstealth = V_nat64ip6stealth = &stealth_off; #endif } else { V_nat64out = &nat64_netisr; /* Leave TTL/HLIM decrementing to forwarding code */ V_nat64ipstealth = V_nat64ip6stealth = &stealth_on; } } int nat64_get_output_method(void) { return (V_nat64out == &nat64_direct ? 1: 0); } static void nat64_log(struct pfloghdr *logdata, struct mbuf *m, sa_family_t family) { logdata->dir = PF_OUT; logdata->af = family; ipfw_bpf_mtap2(logdata, PFLOG_HDRLEN, m); } static int nat64_direct_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct nat64_counters *stats, void *logdata) { int error; if (logdata != NULL) nat64_log(logdata, m, dst->sa_family); error = (*ifp->if_output)(ifp, m, dst, NULL); if (error != 0) NAT64STAT_INC(stats, oerrors); return (error); } static int nat64_direct_output_one(struct mbuf *m, struct nat64_counters *stats, void *logdata) { - struct nhop6_basic nh6; - struct nhop4_basic nh4; + struct nhop_object *nh4 = NULL; + struct nhop_object *nh6 = NULL; struct sockaddr_in6 dst6; struct sockaddr_in dst4; struct sockaddr *dst; struct ip6_hdr *ip6; struct ip *ip4; struct ifnet *ifp; int error; ip4 = mtod(m, struct ip *); + error = 0; switch (ip4->ip_v) { case IPVERSION: dst4.sin_addr = ip4->ip_dst; - error = nat64_find_route4(&nh4, &dst4, m); - if (error != 0) + nh4 = nat64_find_route4(&dst4, m); + if (nh4 == NULL) { NAT64STAT_INC(stats, noroute4); - else { - ifp = nh4.nh_ifp; + error = EHOSTUNREACH; + } else { + ifp = nh4->nh_ifp; dst = (struct sockaddr *)&dst4; } break; case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst6.sin6_addr = ip6->ip6_dst; - error = nat64_find_route6(&nh6, &dst6, m); - if (error != 0) + nh6 = nat64_find_route6(&dst6, m); + if (nh6 == NULL) { NAT64STAT_INC(stats, noroute6); - else { - ifp = nh6.nh_ifp; + error = EHOSTUNREACH; + } else { + ifp = nh6->nh_ifp; dst = (struct sockaddr *)&dst6; } break; default: m_freem(m); NAT64STAT_INC(stats, dropped); DPRINTF(DP_DROPS, "dropped due to unknown IP version"); return (EAFNOSUPPORT); } if (error != 0) { m_freem(m); return (EHOSTUNREACH); } if (logdata != NULL) nat64_log(logdata, m, dst->sa_family); error = (*ifp->if_output)(ifp, m, dst, NULL); if (error != 0) NAT64STAT_INC(stats, oerrors); return (error); } static int nat64_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct nat64_counters *stats, void *logdata) { struct ip *ip4; int ret, af; ip4 = mtod(m, struct ip *); switch (ip4->ip_v) { case IPVERSION: af = AF_INET; ret = NETISR_IP; break; case (IPV6_VERSION >> 4): af = AF_INET6; ret = NETISR_IPV6; break; default: m_freem(m); NAT64STAT_INC(stats, dropped); DPRINTF(DP_DROPS, "unknown IP version"); return (EAFNOSUPPORT); } if (logdata != NULL) nat64_log(logdata, m, af); if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; ret = netisr_queue(ret, m); if (ret != 0) NAT64STAT_INC(stats, oerrors); return (ret); } static int nat64_output_one(struct mbuf *m, struct nat64_counters *stats, void *logdata) { return (nat64_output(NULL, m, NULL, stats, logdata)); } /* * Check the given IPv6 prefix and length according to RFC6052: * The prefixes can only have one of the following lengths: * 32, 40, 48, 56, 64, or 96 (The Well-Known Prefix is 96 bits long). * Returns zero on success, otherwise EINVAL. */ int nat64_check_prefixlen(int length) { switch (length) { case 32: case 40: case 48: case 56: case 64: case 96: return (0); } return (EINVAL); } int nat64_check_prefix6(const struct in6_addr *prefix, int length) { if (nat64_check_prefixlen(length) != 0) return (EINVAL); /* Well-known prefix has 96 prefix length */ if (IN6_IS_ADDR_WKPFX(prefix) && length != 96) return (EINVAL); /* Bits 64 to 71 must be set to zero */ if (prefix->__u6_addr.__u6_addr8[8] != 0) return (EINVAL); /* Some extra checks */ if (IN6_IS_ADDR_MULTICAST(prefix) || IN6_IS_ADDR_UNSPECIFIED(prefix) || IN6_IS_ADDR_LOOPBACK(prefix)) return (EINVAL); return (0); } int nat64_check_private_ip4(const struct nat64_config *cfg, in_addr_t ia) { if (cfg->flags & NAT64_ALLOW_PRIVATE) return (0); /* WKPFX must not be used to represent non-global IPv4 addresses */ if (cfg->flags & NAT64_WKPFX) { /* IN_PRIVATE */ if ((ia & htonl(0xff000000)) == htonl(0x0a000000) || (ia & htonl(0xfff00000)) == htonl(0xac100000) || (ia & htonl(0xffff0000)) == htonl(0xc0a80000)) return (1); /* * RFC 5735: * 192.0.0.0/24 - reserved for IETF protocol assignments * 192.88.99.0/24 - for use as 6to4 relay anycast addresses * 198.18.0.0/15 - for use in benchmark tests * 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 - for use * in documentation and example code */ if ((ia & htonl(0xffffff00)) == htonl(0xc0000000) || (ia & htonl(0xffffff00)) == htonl(0xc0586300) || (ia & htonl(0xfffffe00)) == htonl(0xc6120000) || (ia & htonl(0xffffff00)) == htonl(0xc0000200) || (ia & htonl(0xfffffe00)) == htonl(0xc6336400) || (ia & htonl(0xffffff00)) == htonl(0xcb007100)) return (1); } return (0); } /* * Embed @ia IPv4 address into @ip6 IPv6 address. * Place to embedding determined from prefix length @plen. */ void nat64_embed_ip4(struct in6_addr *ip6, int plen, in_addr_t ia) { switch (plen) { case 32: case 96: ip6->s6_addr32[plen / 32] = ia; break; case 40: case 48: case 56: /* * Preserve prefix bits. * Since suffix bits should be zero and reserved for future * use, we just overwrite the whole word, where they are. */ ip6->s6_addr32[1] &= 0xffffffff << (32 - plen % 32); #if BYTE_ORDER == BIG_ENDIAN ip6->s6_addr32[1] |= ia >> (plen % 32); ip6->s6_addr32[2] = ia << (24 - plen % 32); #elif BYTE_ORDER == LITTLE_ENDIAN ip6->s6_addr32[1] |= ia << (plen % 32); ip6->s6_addr32[2] = ia >> (24 - plen % 32); #endif break; case 64: #if BYTE_ORDER == BIG_ENDIAN ip6->s6_addr32[2] = ia >> 8; ip6->s6_addr32[3] = ia << 24; #elif BYTE_ORDER == LITTLE_ENDIAN ip6->s6_addr32[2] = ia << 8; ip6->s6_addr32[3] = ia >> 24; #endif break; default: panic("Wrong plen: %d", plen); }; /* * Bits 64 to 71 of the address are reserved for compatibility * with the host identifier format defined in the IPv6 addressing * architecture [RFC4291]. These bits MUST be set to zero. */ ip6->s6_addr8[8] = 0; } in_addr_t nat64_extract_ip4(const struct in6_addr *ip6, int plen) { in_addr_t ia; /* * According to RFC 6052 p2.2: * IPv4-embedded IPv6 addresses are composed of a variable-length * prefix, the embedded IPv4 address, and a variable length suffix. * The suffix bits are reserved for future extensions and SHOULD * be set to zero. */ switch (plen) { case 32: if (ip6->s6_addr32[3] != 0 || ip6->s6_addr32[2] != 0) goto badip6; break; case 40: if (ip6->s6_addr32[3] != 0 || (ip6->s6_addr32[2] & htonl(0xff00ffff)) != 0) goto badip6; break; case 48: if (ip6->s6_addr32[3] != 0 || (ip6->s6_addr32[2] & htonl(0xff0000ff)) != 0) goto badip6; break; case 56: if (ip6->s6_addr32[3] != 0 || ip6->s6_addr8[8] != 0) goto badip6; break; case 64: if (ip6->s6_addr8[8] != 0 || (ip6->s6_addr32[3] & htonl(0x00ffffff)) != 0) goto badip6; }; switch (plen) { case 32: case 96: ia = ip6->s6_addr32[plen / 32]; break; case 40: case 48: case 56: #if BYTE_ORDER == BIG_ENDIAN ia = (ip6->s6_addr32[1] << (plen % 32)) | (ip6->s6_addr32[2] >> (24 - plen % 32)); #elif BYTE_ORDER == LITTLE_ENDIAN ia = (ip6->s6_addr32[1] >> (plen % 32)) | (ip6->s6_addr32[2] << (24 - plen % 32)); #endif break; case 64: #if BYTE_ORDER == BIG_ENDIAN ia = (ip6->s6_addr32[2] << 8) | (ip6->s6_addr32[3] >> 24); #elif BYTE_ORDER == LITTLE_ENDIAN ia = (ip6->s6_addr32[2] >> 8) | (ip6->s6_addr32[3] << 24); #endif break; default: return (0); }; if (nat64_check_ip4(ia) == 0) return (ia); DPRINTF(DP_GENERIC | DP_DROPS, "invalid destination address: %08x", ia); return (0); badip6: DPRINTF(DP_GENERIC | DP_DROPS, "invalid IPv4-embedded IPv6 address"); return (0); } /* * According to RFC 1624 the equation for incremental checksum update is: * HC' = ~(~HC + ~m + m') -- [Eqn. 3] * HC' = HC - ~m - m' -- [Eqn. 4] * So, when we are replacing IPv4 addresses to IPv6, we * can assume, that new bytes previously were zeros, and vise versa - * when we replacing IPv6 addresses to IPv4, now unused bytes become * zeros. The payload length in pseudo header has bigger size, but one * half of it should be zero. Using the equation 4 we get: * HC' = HC - (~m0 + m0') -- m0 is first changed word * HC' = (HC - (~m0 + m0')) - (~m1 + m1') -- m1 is second changed word * HC' = HC - ~m0 - m0' - ~m1 - m1' - ... = * = HC - sum(~m[i] + m'[i]) * * The function result should be used as follows: * IPv6 to IPv4: HC' = cksum_add(HC, result) * IPv4 to IPv6: HC' = cksum_add(HC, ~result) */ static uint16_t nat64_cksum_convert(struct ip6_hdr *ip6, struct ip *ip) { uint32_t sum; uint16_t *p; sum = ~ip->ip_src.s_addr >> 16; sum += ~ip->ip_src.s_addr & 0xffff; sum += ~ip->ip_dst.s_addr >> 16; sum += ~ip->ip_dst.s_addr & 0xffff; for (p = (uint16_t *)&ip6->ip6_src; p < (uint16_t *)(&ip6->ip6_src + 2); p++) sum += *p; while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); return (sum); } static void nat64_init_ip4hdr(const struct ip6_hdr *ip6, const struct ip6_frag *frag, uint16_t plen, uint8_t proto, struct ip *ip) { /* assume addresses are already initialized */ ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; ip->ip_len = htons(sizeof(*ip) + plen); ip->ip_ttl = ip6->ip6_hlim; if (*V_nat64ip6stealth == 0) ip->ip_ttl -= IPV6_HLIMDEC; ip->ip_sum = 0; ip->ip_p = (proto == IPPROTO_ICMPV6) ? IPPROTO_ICMP: proto; ip_fillid(ip); if (frag != NULL) { ip->ip_off = htons(ntohs(frag->ip6f_offlg) >> 3); if (frag->ip6f_offlg & IP6F_MORE_FRAG) ip->ip_off |= htons(IP_MF); } else { ip->ip_off = htons(IP_DF); } ip->ip_sum = in_cksum_hdr(ip); } #define FRAGSZ(mtu) ((mtu) - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag)) static NAT64NOINLINE int nat64_fragment6(struct nat64_counters *stats, struct ip6_hdr *ip6, struct mbufq *mq, struct mbuf *m, uint32_t mtu, uint16_t ip_id, uint16_t ip_off) { struct ip6_frag ip6f; struct mbuf *n; uint16_t hlen, len, offset; int plen; plen = ntohs(ip6->ip6_plen); hlen = sizeof(struct ip6_hdr); /* Fragmentation isn't needed */ if (ip_off == 0 && plen <= mtu - hlen) { M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { NAT64STAT_INC(stats, nomem); return (ENOMEM); } bcopy(ip6, mtod(m, void *), hlen); if (mbufq_enqueue(mq, m) != 0) { m_freem(m); NAT64STAT_INC(stats, dropped); DPRINTF(DP_DROPS, "dropped due to mbufq overflow"); return (ENOBUFS); } return (0); } hlen += sizeof(struct ip6_frag); ip6f.ip6f_reserved = 0; ip6f.ip6f_nxt = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; if (ip_off != 0) { /* * We have got an IPv4 fragment. * Use offset value and ip_id from original fragment. */ ip6f.ip6f_ident = htonl(ntohs(ip_id)); offset = (ntohs(ip_off) & IP_OFFMASK) << 3; NAT64STAT_INC(stats, ifrags); } else { /* The packet size exceeds interface MTU */ ip6f.ip6f_ident = htonl(ip6_randomid()); offset = 0; /* First fragment*/ } while (plen > 0 && m != NULL) { n = NULL; len = FRAGSZ(mtu) & ~7; if (len > plen) len = plen; ip6->ip6_plen = htons(len + sizeof(ip6f)); ip6f.ip6f_offlg = ntohs(offset); if (len < plen || (ip_off & htons(IP_MF)) != 0) ip6f.ip6f_offlg |= IP6F_MORE_FRAG; offset += len; plen -= len; if (plen > 0) { n = m_split(m, len, M_NOWAIT); if (n == NULL) goto fail; } M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) goto fail; bcopy(ip6, mtod(m, void *), sizeof(struct ip6_hdr)); bcopy(&ip6f, mtodo(m, sizeof(struct ip6_hdr)), sizeof(struct ip6_frag)); if (mbufq_enqueue(mq, m) != 0) goto fail; m = n; } NAT64STAT_ADD(stats, ofrags, mbufq_len(mq)); return (0); fail: if (m != NULL) m_freem(m); if (n != NULL) m_freem(n); mbufq_drain(mq); NAT64STAT_INC(stats, nomem); return (ENOMEM); } -static NAT64NOINLINE int -nat64_find_route6(struct nhop6_basic *pnh, struct sockaddr_in6 *dst, - struct mbuf *m) +static struct nhop_object * +nat64_find_route6(struct sockaddr_in6 *dst, struct mbuf *m) { - - if (fib6_lookup_nh_basic(M_GETFIB(m), &dst->sin6_addr, 0, 0, 0, - pnh) != 0) - return (EHOSTUNREACH); - if (pnh->nh_flags & (NHF_BLACKHOLE | NHF_REJECT)) - return (EHOSTUNREACH); + struct nhop_object *nh; + NET_EPOCH_ASSERT(); + nh = fib6_lookup(M_GETFIB(m), &dst->sin6_addr, 0, 0, 0); + if (nh == NULL) + return NULL; + if (nh->nh_flags & (NHF_BLACKHOLE | NHF_REJECT)) + return NULL; /* * XXX: we need to use destination address with embedded scope * zone id, because LLTABLE uses such form of addresses for lookup. */ dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); - dst->sin6_addr = pnh->nh_addr; + dst->sin6_addr = ifatoia6(nh->nh_ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) dst->sin6_addr.s6_addr16[1] = - htons(pnh->nh_ifp->if_index & 0xffff); + htons(nh->nh_ifp->if_index & 0xffff); dst->sin6_port = 0; dst->sin6_scope_id = 0; dst->sin6_flowinfo = 0; - return (0); + return nh; } #define NAT64_ICMP6_PLEN 64 static NAT64NOINLINE void nat64_icmp6_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint32_t mtu, struct nat64_counters *stats, void *logdata) { struct icmp6_hdr *icmp6; struct ip6_hdr *ip6, *oip6; struct mbuf *n; int len, plen, proto; len = 0; proto = nat64_getlasthdr(m, &len); if (proto < 0) { DPRINTF(DP_DROPS, "mbuf isn't contigious"); goto freeit; } /* * Do not send ICMPv6 in reply to ICMPv6 errors. */ if (proto == IPPROTO_ICMPV6) { if (m->m_len < len + sizeof(*icmp6)) { DPRINTF(DP_DROPS, "mbuf isn't contigious"); goto freeit; } icmp6 = mtodo(m, len); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST || icmp6->icmp6_type == ND_REDIRECT) { DPRINTF(DP_DROPS, "do not send ICMPv6 in reply to " "ICMPv6 errors"); goto freeit; } /* * If there are extra headers between IPv6 and ICMPv6, * strip off them. */ if (len > sizeof(struct ip6_hdr)) { /* * NOTE: ipfw_chk already did m_pullup() and it is * expected that data is contigious from the start * of IPv6 header up to the end of ICMPv6 header. */ bcopy(mtod(m, caddr_t), mtodo(m, len - sizeof(struct ip6_hdr)), sizeof(struct ip6_hdr)); m_adj(m, len - sizeof(struct ip6_hdr)); } } /* if (icmp6_ratelimit(&ip6->ip6_src, type, code)) goto freeit; */ ip6 = mtod(m, struct ip6_hdr *); switch (type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: case ICMP6_PARAM_PROB: break; default: goto freeit; } /* Calculate length of ICMPv6 payload */ len = (m->m_pkthdr.len > NAT64_ICMP6_PLEN) ? NAT64_ICMP6_PLEN: m->m_pkthdr.len; /* Create new ICMPv6 datagram */ plen = len + sizeof(struct icmp6_hdr); n = m_get2(sizeof(struct ip6_hdr) + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR); if (n == NULL) { NAT64STAT_INC(stats, nomem); m_freem(m); return; } /* * Move pkthdr from original mbuf. We should have initialized some * fields, because we can reinject this mbuf to netisr and it will * go trough input path (it requires at least rcvif should be set). * Also do M_ALIGN() to reduce chances of need to allocate new mbuf * in the chain, when we will do M_PREPEND() or make some type of * tunneling. */ m_move_pkthdr(n, m); M_ALIGN(n, sizeof(struct ip6_hdr) + plen + max_hdr); n->m_len = n->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; oip6 = mtod(n, struct ip6_hdr *); /* * Make IPv6 source address selection for reflected datagram. * nat64_check_ip6() doesn't allow scoped addresses, therefore * we use zero scopeid. */ if (in6_selectsrc_addr(M_GETFIB(n), &ip6->ip6_src, 0, n->m_pkthdr.rcvif, &oip6->ip6_src, NULL) != 0) { /* * Failed to find proper source address, drop the packet. */ m_freem(n); goto freeit; } oip6->ip6_dst = ip6->ip6_src; oip6->ip6_nxt = IPPROTO_ICMPV6; oip6->ip6_flow = 0; oip6->ip6_vfc |= IPV6_VERSION; oip6->ip6_hlim = V_ip6_defhlim; oip6->ip6_plen = htons(plen); icmp6 = mtodo(n, sizeof(struct ip6_hdr)); icmp6->icmp6_cksum = 0; icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_mtu = htonl(mtu); m_copydata(m, 0, len, mtodo(n, sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr))); icmp6->icmp6_cksum = in6_cksum(n, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); m_freem(m); V_nat64out->output_one(n, stats, logdata); return; freeit: NAT64STAT_INC(stats, dropped); m_freem(m); } -static NAT64NOINLINE int -nat64_find_route4(struct nhop4_basic *pnh, struct sockaddr_in *dst, - struct mbuf *m) +static struct nhop_object * +nat64_find_route4(struct sockaddr_in *dst, struct mbuf *m) { + struct nhop_object *nh; - if (fib4_lookup_nh_basic(M_GETFIB(m), dst->sin_addr, 0, 0, pnh) != 0) - return (EHOSTUNREACH); - if (pnh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST | NHF_REJECT)) - return (EHOSTUNREACH); + NET_EPOCH_ASSERT(); + nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, 0, 0); + if (nh == NULL) + return NULL; + if (nh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST | NHF_REJECT)) + return NULL; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); - dst->sin_addr = pnh->nh_addr; + dst->sin_addr = IA_SIN(nh->nh_ifa)->sin_addr; dst->sin_port = 0; - return (0); + return nh; } #define NAT64_ICMP_PLEN 64 static NAT64NOINLINE void nat64_icmp_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint16_t mtu, struct nat64_counters *stats, void *logdata) { struct icmp *icmp; struct ip *ip, *oip; struct mbuf *n; int len, plen; ip = mtod(m, struct ip *); /* Do not send ICMP error if packet is not the first fragment */ if (ip->ip_off & ~ntohs(IP_MF|IP_DF)) { DPRINTF(DP_DROPS, "not first fragment"); goto freeit; } /* Do not send ICMP in reply to ICMP errors */ if (ip->ip_p == IPPROTO_ICMP) { if (m->m_len < (ip->ip_hl << 2)) { DPRINTF(DP_DROPS, "mbuf isn't contigious"); goto freeit; } icmp = mtodo(m, ip->ip_hl << 2); if (!ICMP_INFOTYPE(icmp->icmp_type)) { DPRINTF(DP_DROPS, "do not send ICMP in reply to " "ICMP errors"); goto freeit; } } switch (type) { case ICMP_UNREACH: case ICMP_TIMXCEED: case ICMP_PARAMPROB: break; default: goto freeit; } /* Calculate length of ICMP payload */ len = (m->m_pkthdr.len > NAT64_ICMP_PLEN) ? (ip->ip_hl << 2) + 8: m->m_pkthdr.len; /* Create new ICMPv4 datagram */ plen = len + sizeof(struct icmphdr) + sizeof(uint32_t); n = m_get2(sizeof(struct ip) + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR); if (n == NULL) { NAT64STAT_INC(stats, nomem); m_freem(m); return; } m_move_pkthdr(n, m); M_ALIGN(n, sizeof(struct ip) + plen + max_hdr); n->m_len = n->m_pkthdr.len = sizeof(struct ip) + plen; oip = mtod(n, struct ip *); oip->ip_v = IPVERSION; oip->ip_hl = sizeof(struct ip) >> 2; oip->ip_tos = 0; oip->ip_len = htons(n->m_pkthdr.len); oip->ip_ttl = V_ip_defttl; oip->ip_p = IPPROTO_ICMP; ip_fillid(oip); oip->ip_off = htons(IP_DF); oip->ip_src = ip->ip_dst; oip->ip_dst = ip->ip_src; oip->ip_sum = 0; oip->ip_sum = in_cksum_hdr(oip); icmp = mtodo(n, sizeof(struct ip)); icmp->icmp_type = type; icmp->icmp_code = code; icmp->icmp_cksum = 0; icmp->icmp_pmvoid = 0; icmp->icmp_nextmtu = htons(mtu); m_copydata(m, 0, len, mtodo(n, sizeof(struct ip) + sizeof(struct icmphdr) + sizeof(uint32_t))); icmp->icmp_cksum = in_cksum_skip(n, sizeof(struct ip) + plen, sizeof(struct ip)); m_freem(m); V_nat64out->output_one(n, stats, logdata); return; freeit: NAT64STAT_INC(stats, dropped); m_freem(m); } /* Translate ICMP echo request/reply into ICMPv6 */ static void nat64_icmp_handle_echo(struct ip6_hdr *ip6, struct icmp6_hdr *icmp6, uint16_t id, uint8_t type) { uint16_t old; old = *(uint16_t *)icmp6; /* save type+code in one word */ icmp6->icmp6_type = type; /* Reflect ICMPv6 -> ICMPv4 type translation in the cksum */ icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum, old, *(uint16_t *)icmp6); if (id != 0) { old = icmp6->icmp6_id; icmp6->icmp6_id = id; /* Reflect ICMP id translation in the cksum */ icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum, old, id); } /* Reflect IPv6 pseudo header in the cksum */ icmp6->icmp6_cksum = ~in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen), IPPROTO_ICMPV6, ~icmp6->icmp6_cksum); } static NAT64NOINLINE struct mbuf * nat64_icmp_translate(struct mbuf *m, struct ip6_hdr *ip6, uint16_t icmpid, int offset, struct nat64_config *cfg) { struct ip ip; struct icmp *icmp; struct tcphdr *tcp; struct udphdr *udp; struct ip6_hdr *eip6; struct mbuf *n; uint32_t mtu; int len, hlen, plen; uint8_t type, code; if (m->m_len < offset + ICMP_MINLEN) m = m_pullup(m, offset + ICMP_MINLEN); if (m == NULL) { NAT64STAT_INC(&cfg->stats, nomem); return (m); } mtu = 0; icmp = mtodo(m, offset); /* RFC 7915 p4.2 */ switch (icmp->icmp_type) { case ICMP_ECHOREPLY: type = ICMP6_ECHO_REPLY; code = 0; break; case ICMP_UNREACH: type = ICMP6_DST_UNREACH; switch (icmp->icmp_code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: code = ICMP6_DST_UNREACH_NOROUTE; break; case ICMP_UNREACH_PROTOCOL: type = ICMP6_PARAM_PROB; code = ICMP6_PARAMPROB_NEXTHEADER; break; case ICMP_UNREACH_PORT: code = ICMP6_DST_UNREACH_NOPORT; break; case ICMP_UNREACH_NEEDFRAG: type = ICMP6_PACKET_TOO_BIG; code = 0; /* XXX: needs an additional look */ mtu = max(IPV6_MMTU, ntohs(icmp->icmp_nextmtu) + 20); break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = ICMP6_DST_UNREACH_ADMIN; break; default: DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d", icmp->icmp_type, icmp->icmp_code); goto freeit; } break; case ICMP_TIMXCEED: type = ICMP6_TIME_EXCEEDED; code = icmp->icmp_code; break; case ICMP_ECHO: type = ICMP6_ECHO_REQUEST; code = 0; break; case ICMP_PARAMPROB: type = ICMP6_PARAM_PROB; switch (icmp->icmp_code) { case ICMP_PARAMPROB_ERRATPTR: case ICMP_PARAMPROB_LENGTH: code = ICMP6_PARAMPROB_HEADER; switch (icmp->icmp_pptr) { case 0: /* Version/IHL */ case 1: /* Type Of Service */ mtu = icmp->icmp_pptr; break; case 2: /* Total Length */ case 3: mtu = 4; /* Payload Length */ break; case 8: /* Time to Live */ mtu = 7; /* Hop Limit */ break; case 9: /* Protocol */ mtu = 6; /* Next Header */ break; case 12: /* Source address */ case 13: case 14: case 15: mtu = 8; break; case 16: /* Destination address */ case 17: case 18: case 19: mtu = 24; break; default: /* Silently drop */ DPRINTF(DP_DROPS, "Unsupported ICMP type %d," " code %d, pptr %d", icmp->icmp_type, icmp->icmp_code, icmp->icmp_pptr); goto freeit; } break; default: DPRINTF(DP_DROPS, "Unsupported ICMP type %d," " code %d, pptr %d", icmp->icmp_type, icmp->icmp_code, icmp->icmp_pptr); goto freeit; } break; default: DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d", icmp->icmp_type, icmp->icmp_code); goto freeit; } /* * For echo request/reply we can use original payload, * but we need adjust icmp_cksum, because ICMPv6 cksum covers * IPv6 pseudo header and ICMPv6 types differs from ICMPv4. */ if (type == ICMP6_ECHO_REQUEST || type == ICMP6_ECHO_REPLY) { nat64_icmp_handle_echo(ip6, ICMP6(icmp), icmpid, type); return (m); } /* * For other types of ICMP messages we need to translate inner * IPv4 header to IPv6 header. * Assume ICMP src is the same as payload dst * E.g. we have ( GWsrc1 , NATIP1 ) in outer header * and ( NATIP1, Hostdst1 ) in ICMP copy header. * In that case, we already have map for NATIP1 and GWsrc1. * The only thing we need is to copy IPv6 map prefix to * Hostdst1. */ hlen = offset + ICMP_MINLEN; if (m->m_pkthdr.len < hlen + sizeof(struct ip) + ICMP_MINLEN) { DPRINTF(DP_DROPS, "Message is too short %d", m->m_pkthdr.len); goto freeit; } m_copydata(m, hlen, sizeof(struct ip), (char *)&ip); if (ip.ip_v != IPVERSION) { DPRINTF(DP_DROPS, "Wrong IP version %d", ip.ip_v); goto freeit; } hlen += ip.ip_hl << 2; /* Skip inner IP header */ if (nat64_check_ip4(ip.ip_src.s_addr) != 0 || nat64_check_ip4(ip.ip_dst.s_addr) != 0 || nat64_check_private_ip4(cfg, ip.ip_src.s_addr) != 0 || nat64_check_private_ip4(cfg, ip.ip_dst.s_addr) != 0) { DPRINTF(DP_DROPS, "IP addresses checks failed %04x -> %04x", ntohl(ip.ip_src.s_addr), ntohl(ip.ip_dst.s_addr)); goto freeit; } if (m->m_pkthdr.len < hlen + ICMP_MINLEN) { DPRINTF(DP_DROPS, "Message is too short %d", m->m_pkthdr.len); goto freeit; } #if 0 /* * Check that inner source matches the outer destination. * XXX: We need some method to convert IPv4 into IPv6 address here, * and compare IPv6 addresses. */ if (ip.ip_src.s_addr != nat64_get_ip4(&ip6->ip6_dst)) { DPRINTF(DP_GENERIC, "Inner source doesn't match destination ", "%04x vs %04x", ip.ip_src.s_addr, nat64_get_ip4(&ip6->ip6_dst)); goto freeit; } #endif /* * Create new mbuf for ICMPv6 datagram. * NOTE: len is data length just after inner IP header. */ len = m->m_pkthdr.len - hlen; if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + len > NAT64_ICMP6_PLEN) len = NAT64_ICMP6_PLEN - sizeof(struct icmp6_hdr) - sizeof(struct ip6_hdr); plen = sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr) + len; n = m_get2(offset + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR); if (n == NULL) { NAT64STAT_INC(&cfg->stats, nomem); m_freem(m); return (NULL); } m_move_pkthdr(n, m); M_ALIGN(n, offset + plen + max_hdr); n->m_len = n->m_pkthdr.len = offset + plen; /* Adjust ip6_plen in outer header */ ip6->ip6_plen = htons(plen); /* Construct new inner IPv6 header */ eip6 = mtodo(n, offset + sizeof(struct icmp6_hdr)); eip6->ip6_src = ip6->ip6_dst; /* Use the same prefix that we have in outer header */ eip6->ip6_dst = ip6->ip6_src; MPASS(cfg->flags & NAT64_PLATPFX); nat64_embed_ip4(&eip6->ip6_dst, cfg->plat_plen, ip.ip_dst.s_addr); eip6->ip6_flow = htonl(ip.ip_tos << 20); eip6->ip6_vfc |= IPV6_VERSION; eip6->ip6_hlim = ip.ip_ttl; eip6->ip6_plen = htons(ntohs(ip.ip_len) - (ip.ip_hl << 2)); eip6->ip6_nxt = (ip.ip_p == IPPROTO_ICMP) ? IPPROTO_ICMPV6: ip.ip_p; m_copydata(m, hlen, len, (char *)(eip6 + 1)); /* * We need to translate source port in the inner ULP header, * and adjust ULP checksum. */ switch (ip.ip_p) { case IPPROTO_TCP: if (len < offsetof(struct tcphdr, th_sum)) break; tcp = TCP(eip6 + 1); if (icmpid != 0) { tcp->th_sum = cksum_adjust(tcp->th_sum, tcp->th_sport, icmpid); tcp->th_sport = icmpid; } tcp->th_sum = cksum_add(tcp->th_sum, ~nat64_cksum_convert(eip6, &ip)); break; case IPPROTO_UDP: if (len < offsetof(struct udphdr, uh_sum)) break; udp = UDP(eip6 + 1); if (icmpid != 0) { udp->uh_sum = cksum_adjust(udp->uh_sum, udp->uh_sport, icmpid); udp->uh_sport = icmpid; } udp->uh_sum = cksum_add(udp->uh_sum, ~nat64_cksum_convert(eip6, &ip)); break; case IPPROTO_ICMP: /* * Check if this is an ICMP error message for echo request * that we sent. I.e. ULP in the data containing invoking * packet is IPPROTO_ICMP and its type is ICMP_ECHO. */ icmp = (struct icmp *)(eip6 + 1); if (icmp->icmp_type != ICMP_ECHO) { m_freem(n); goto freeit; } /* * For our client this original datagram should looks * like it was ICMPv6 datagram with type ICMP6_ECHO_REQUEST. * Thus we need adjust icmp_cksum and convert type from * ICMP_ECHO to ICMP6_ECHO_REQUEST. */ nat64_icmp_handle_echo(eip6, ICMP6(icmp), icmpid, ICMP6_ECHO_REQUEST); } m_freem(m); /* Convert ICMPv4 into ICMPv6 header */ icmp = mtodo(n, offset); ICMP6(icmp)->icmp6_type = type; ICMP6(icmp)->icmp6_code = code; ICMP6(icmp)->icmp6_mtu = htonl(mtu); ICMP6(icmp)->icmp6_cksum = 0; ICMP6(icmp)->icmp6_cksum = cksum_add( ~in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0), in_cksum_skip(n, n->m_pkthdr.len, offset)); return (n); freeit: m_freem(m); NAT64STAT_INC(&cfg->stats, dropped); return (NULL); } int nat64_getlasthdr(struct mbuf *m, int *offset) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; int proto, hlen; if (offset != NULL) hlen = *offset; else hlen = 0; if (m->m_len < hlen + sizeof(*ip6)) return (-1); ip6 = mtodo(m, hlen); hlen += sizeof(*ip6); proto = ip6->ip6_nxt; /* Skip extension headers */ while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING || proto == IPPROTO_DSTOPTS) { hbh = mtodo(m, hlen); /* * We expect mbuf has contigious data up to * upper level header. */ if (m->m_len < hlen) return (-1); /* * We doesn't support Jumbo payload option, * so return error. */ if (proto == IPPROTO_HOPOPTS && ip6->ip6_plen == 0) return (-1); proto = hbh->ip6h_nxt; hlen += (hbh->ip6h_len + 1) << 3; } if (offset != NULL) *offset = hlen; return (proto); } int nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr, struct in6_addr *daddr, uint16_t lport, struct nat64_config *cfg, void *logdata) { - struct nhop6_basic nh; + struct nhop_object *nh; struct ip6_hdr ip6; struct sockaddr_in6 dst; struct ip *ip; struct mbufq mq; uint16_t ip_id, ip_off; uint16_t *csum; int plen, hlen; uint8_t proto; ip = mtod(m, struct ip*); if (*V_nat64ipstealth == 0 && ip->ip_ttl <= IPTTLDEC) { nat64_icmp_reflect(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, &cfg->stats, logdata); return (NAT64RETURN); } ip6.ip6_dst = *daddr; ip6.ip6_src = *saddr; hlen = ip->ip_hl << 2; plen = ntohs(ip->ip_len) - hlen; proto = ip->ip_p; /* Save ip_id and ip_off, both are in network byte order */ ip_id = ip->ip_id; ip_off = ip->ip_off & htons(IP_OFFMASK | IP_MF); /* Fragment length must be multiple of 8 octets */ if ((ip->ip_off & htons(IP_MF)) != 0 && (plen & 0x7) != 0) { nat64_icmp_reflect(m, ICMP_PARAMPROB, ICMP_PARAMPROB_LENGTH, 0, &cfg->stats, logdata); return (NAT64RETURN); } /* Fragmented ICMP is unsupported */ if (proto == IPPROTO_ICMP && ip_off != 0) { DPRINTF(DP_DROPS, "dropped due to fragmented ICMP"); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } dst.sin6_addr = ip6.ip6_dst; - if (nat64_find_route6(&nh, &dst, m) != 0) { + nh = nat64_find_route6(&dst, m); + if (nh == NULL) { NAT64STAT_INC(&cfg->stats, noroute6); nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, &cfg->stats, logdata); return (NAT64RETURN); } - if (nh.nh_mtu < plen + sizeof(ip6) && + if (nh->nh_mtu < plen + sizeof(ip6) && (ip->ip_off & htons(IP_DF)) != 0) { nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, - FRAGSZ(nh.nh_mtu) + sizeof(struct ip), &cfg->stats, logdata); + FRAGSZ(nh->nh_mtu) + sizeof(struct ip), &cfg->stats, logdata); return (NAT64RETURN); } ip6.ip6_flow = htonl(ip->ip_tos << 20); ip6.ip6_vfc |= IPV6_VERSION; ip6.ip6_hlim = ip->ip_ttl; if (*V_nat64ipstealth == 0) ip6.ip6_hlim -= IPTTLDEC; ip6.ip6_plen = htons(plen); ip6.ip6_nxt = (proto == IPPROTO_ICMP) ? IPPROTO_ICMPV6: proto; /* Convert checksums. */ switch (proto) { case IPPROTO_TCP: csum = &TCP(mtodo(m, hlen))->th_sum; if (lport != 0) { struct tcphdr *tcp = TCP(mtodo(m, hlen)); *csum = cksum_adjust(*csum, tcp->th_dport, lport); tcp->th_dport = lport; } *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip)); break; case IPPROTO_UDP: csum = &UDP(mtodo(m, hlen))->uh_sum; if (lport != 0) { struct udphdr *udp = UDP(mtodo(m, hlen)); *csum = cksum_adjust(*csum, udp->uh_dport, lport); udp->uh_dport = lport; } *csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip)); break; case IPPROTO_ICMP: m = nat64_icmp_translate(m, &ip6, lport, hlen, cfg); if (m == NULL) /* stats already accounted */ return (NAT64RETURN); } m_adj(m, hlen); mbufq_init(&mq, 255); - nat64_fragment6(&cfg->stats, &ip6, &mq, m, nh.nh_mtu, ip_id, ip_off); + nat64_fragment6(&cfg->stats, &ip6, &mq, m, nh->nh_mtu, ip_id, ip_off); while ((m = mbufq_dequeue(&mq)) != NULL) { - if (V_nat64out->output(nh.nh_ifp, m, (struct sockaddr *)&dst, + if (V_nat64out->output(nh->nh_ifp, m, (struct sockaddr *)&dst, &cfg->stats, logdata) != 0) break; NAT64STAT_INC(&cfg->stats, opcnt46); } mbufq_drain(&mq); return (NAT64RETURN); } int nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport, struct nat64_config *cfg, void *logdata) { struct ip ip; struct icmp6_hdr *icmp6; struct ip6_frag *ip6f; struct ip6_hdr *ip6, *ip6i; uint32_t mtu; int plen, proto; uint8_t type, code; if (hlen == 0) { ip6 = mtod(m, struct ip6_hdr *); if (nat64_check_ip6(&ip6->ip6_src) != 0 || nat64_check_ip6(&ip6->ip6_dst) != 0) return (NAT64SKIP); proto = nat64_getlasthdr(m, &hlen); if (proto != IPPROTO_ICMPV6) { DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious"); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } } /* * Translate ICMPv6 type and code to ICMPv4 (RFC7915). * NOTE: ICMPv6 echo handled by nat64_do_handle_ip6(). */ icmp6 = mtodo(m, hlen); mtu = 0; switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: type = ICMP_UNREACH; switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_BEYONDSCOPE: case ICMP6_DST_UNREACH_ADDR: code = ICMP_UNREACH_HOST; break; case ICMP6_DST_UNREACH_ADMIN: code = ICMP_UNREACH_HOST_PROHIB; break; case ICMP6_DST_UNREACH_NOPORT: code = ICMP_UNREACH_PORT; break; default: DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d," " code %d", icmp6->icmp6_type, icmp6->icmp6_code); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } break; case ICMP6_PACKET_TOO_BIG: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; mtu = ntohl(icmp6->icmp6_mtu); if (mtu < IPV6_MMTU) { DPRINTF(DP_DROPS, "Wrong MTU %d in ICMPv6 type %d," " code %d", mtu, icmp6->icmp6_type, icmp6->icmp6_code); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } /* * Adjust MTU to reflect difference between * IPv6 an IPv4 headers. */ mtu -= sizeof(struct ip6_hdr) - sizeof(struct ip); break; case ICMP6_TIME_EXCEEDED: type = ICMP_TIMXCEED; code = icmp6->icmp6_code; break; case ICMP6_PARAM_PROB: switch (icmp6->icmp6_code) { case ICMP6_PARAMPROB_HEADER: type = ICMP_PARAMPROB; code = ICMP_PARAMPROB_ERRATPTR; mtu = ntohl(icmp6->icmp6_pptr); switch (mtu) { case 0: /* Version/Traffic Class */ case 1: /* Traffic Class/Flow Label */ break; case 4: /* Payload Length */ case 5: mtu = 2; break; case 6: /* Next Header */ mtu = 9; break; case 7: /* Hop Limit */ mtu = 8; break; default: if (mtu >= 8 && mtu <= 23) { mtu = 12; /* Source address */ break; } if (mtu >= 24 && mtu <= 39) { mtu = 16; /* Destination address */ break; } DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d," " code %d, pptr %d", icmp6->icmp6_type, icmp6->icmp6_code, mtu); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } case ICMP6_PARAMPROB_NEXTHEADER: type = ICMP_UNREACH; code = ICMP_UNREACH_PROTOCOL; break; default: DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d," " code %d, pptr %d", icmp6->icmp6_type, icmp6->icmp6_code, ntohl(icmp6->icmp6_pptr)); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } break; default: DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d, code %d", icmp6->icmp6_type, icmp6->icmp6_code); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } hlen += sizeof(struct icmp6_hdr); if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) { NAT64STAT_INC(&cfg->stats, dropped); DPRINTF(DP_DROPS, "Message is too short %d", m->m_pkthdr.len); return (NAT64MFREE); } /* * We need at least ICMP_MINLEN bytes of original datagram payload * to generate ICMP message. It is nice that ICMP_MINLEN is equal * to sizeof(struct ip6_frag). So, if embedded datagram had a fragment * header we will not have to do m_pullup() again. * * What we have here: * Outer header: (IPv6iGW, v4mapPRefix+v4exthost) * Inner header: (v4mapPRefix+v4host, IPv6iHost) [sport, dport] * We need to translate it to: * * Outer header: (alias_host, v4exthost) * Inner header: (v4exthost, alias_host) [sport, alias_port] * * Assume caller function has checked if v4mapPRefix+v4host * matches configured prefix. * The only two things we should be provided with are mapping between * IPv6iHost <> alias_host and between dport and alias_port. */ if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN); if (m == NULL) { NAT64STAT_INC(&cfg->stats, nomem); return (NAT64RETURN); } ip6 = mtod(m, struct ip6_hdr *); ip6i = mtodo(m, hlen); ip6f = NULL; proto = ip6i->ip6_nxt; plen = ntohs(ip6i->ip6_plen); hlen += sizeof(struct ip6_hdr); if (proto == IPPROTO_FRAGMENT) { if (m->m_pkthdr.len < hlen + sizeof(struct ip6_frag) + ICMP_MINLEN) goto fail; ip6f = mtodo(m, hlen); proto = ip6f->ip6f_nxt; plen -= sizeof(struct ip6_frag); hlen += sizeof(struct ip6_frag); /* Ajust MTU to reflect frag header size */ if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) mtu -= sizeof(struct ip6_frag); } if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { DPRINTF(DP_DROPS, "Unsupported proto %d in the inner header", proto); goto fail; } if (nat64_check_ip6(&ip6i->ip6_src) != 0 || nat64_check_ip6(&ip6i->ip6_dst) != 0) { DPRINTF(DP_DROPS, "Inner addresses do not passes the check"); goto fail; } /* Check if outer dst is the same as inner src */ if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6i->ip6_src)) { DPRINTF(DP_DROPS, "Inner src doesn't match outer dst"); goto fail; } /* Now we need to make a fake IPv4 packet to generate ICMP message */ ip.ip_dst.s_addr = aaddr; ip.ip_src.s_addr = nat64_extract_ip4(&ip6i->ip6_src, cfg->plat_plen); if (ip.ip_src.s_addr == 0) goto fail; /* XXX: Make fake ulp header */ if (V_nat64out == &nat64_direct) /* init_ip4hdr will decrement it */ ip6i->ip6_hlim += IPV6_HLIMDEC; nat64_init_ip4hdr(ip6i, ip6f, plen, proto, &ip); m_adj(m, hlen - sizeof(struct ip)); bcopy(&ip, mtod(m, void *), sizeof(ip)); nat64_icmp_reflect(m, type, code, (uint16_t)mtu, &cfg->stats, logdata); return (NAT64RETURN); fail: /* * We must call m_freem() because mbuf pointer could be * changed with m_pullup(). */ m_freem(m); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64RETURN); } int nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport, struct nat64_config *cfg, void *logdata) { struct ip ip; - struct nhop4_basic nh; + struct nhop_object *nh; struct sockaddr_in dst; struct ip6_frag *frag; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; uint16_t *csum; int plen, hlen, proto; /* * XXX: we expect ipfw_chk() did m_pullup() up to upper level * protocol's headers. Also we skip some checks, that ip6_input(), * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did. */ ip6 = mtod(m, struct ip6_hdr *); if (nat64_check_ip6(&ip6->ip6_src) != 0 || nat64_check_ip6(&ip6->ip6_dst) != 0) { return (NAT64SKIP); } /* Starting from this point we must not return zero */ ip.ip_src.s_addr = aaddr; if (nat64_check_ip4(ip.ip_src.s_addr) != 0) { DPRINTF(DP_GENERIC | DP_DROPS, "invalid source address: %08x", ip.ip_src.s_addr); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } ip.ip_dst.s_addr = nat64_extract_ip4(&ip6->ip6_dst, cfg->plat_plen); if (ip.ip_dst.s_addr == 0) { NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } if (*V_nat64ip6stealth == 0 && ip6->ip6_hlim <= IPV6_HLIMDEC) { nat64_icmp6_reflect(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0, &cfg->stats, logdata); return (NAT64RETURN); } hlen = 0; plen = ntohs(ip6->ip6_plen); proto = nat64_getlasthdr(m, &hlen); if (proto < 0) { DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious"); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } frag = NULL; if (proto == IPPROTO_FRAGMENT) { /* ipfw_chk should m_pullup up to frag header */ if (m->m_len < hlen + sizeof(*frag)) { DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious"); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } frag = mtodo(m, hlen); proto = frag->ip6f_nxt; hlen += sizeof(*frag); /* Fragmented ICMPv6 is unsupported */ if (proto == IPPROTO_ICMPV6) { DPRINTF(DP_DROPS, "dropped due to fragmented ICMPv6"); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } /* Fragment length must be multiple of 8 octets */ if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0 && ((plen + sizeof(struct ip6_hdr) - hlen) & 0x7) != 0) { nat64_icmp6_reflect(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen), &cfg->stats, logdata); return (NAT64RETURN); } } plen -= hlen - sizeof(struct ip6_hdr); if (plen < 0 || m->m_pkthdr.len < plen + hlen) { DPRINTF(DP_DROPS, "plen %d, pkthdr.len %d, hlen %d", plen, m->m_pkthdr.len, hlen); NAT64STAT_INC(&cfg->stats, dropped); return (NAT64MFREE); } icmp6 = NULL; /* Make gcc happy */ if (proto == IPPROTO_ICMPV6) { icmp6 = mtodo(m, hlen); if (icmp6->icmp6_type != ICMP6_ECHO_REQUEST && icmp6->icmp6_type != ICMP6_ECHO_REPLY) return (nat64_handle_icmp6(m, hlen, aaddr, aport, cfg, logdata)); } dst.sin_addr.s_addr = ip.ip_dst.s_addr; - if (nat64_find_route4(&nh, &dst, m) != 0) { + nh = nat64_find_route4(&dst, m); + if (nh == NULL) { NAT64STAT_INC(&cfg->stats, noroute4); nat64_icmp6_reflect(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0, &cfg->stats, logdata); return (NAT64RETURN); } - if (nh.nh_mtu < plen + sizeof(ip)) { - nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu, + if (nh->nh_mtu < plen + sizeof(ip)) { + nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu, &cfg->stats, logdata); return (NAT64RETURN); } nat64_init_ip4hdr(ip6, frag, plen, proto, &ip); /* Convert checksums. */ switch (proto) { case IPPROTO_TCP: csum = &TCP(mtodo(m, hlen))->th_sum; if (aport != 0) { struct tcphdr *tcp = TCP(mtodo(m, hlen)); *csum = cksum_adjust(*csum, tcp->th_sport, aport); tcp->th_sport = aport; } *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip)); break; case IPPROTO_UDP: csum = &UDP(mtodo(m, hlen))->uh_sum; if (aport != 0) { struct udphdr *udp = UDP(mtodo(m, hlen)); *csum = cksum_adjust(*csum, udp->uh_sport, aport); udp->uh_sport = aport; } *csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip)); break; case IPPROTO_ICMPV6: /* Checksum in ICMPv6 covers pseudo header */ csum = &icmp6->icmp6_cksum; *csum = cksum_add(*csum, in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0)); /* Convert ICMPv6 types to ICMP */ proto = *(uint16_t *)icmp6; /* save old word for cksum_adjust */ if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST) icmp6->icmp6_type = ICMP_ECHO; else /* ICMP6_ECHO_REPLY */ icmp6->icmp6_type = ICMP_ECHOREPLY; *csum = cksum_adjust(*csum, (uint16_t)proto, *(uint16_t *)icmp6); if (aport != 0) { uint16_t old_id = icmp6->icmp6_id; icmp6->icmp6_id = aport; *csum = cksum_adjust(*csum, old_id, aport); } break; }; m_adj(m, hlen - sizeof(ip)); bcopy(&ip, mtod(m, void *), sizeof(ip)); - if (V_nat64out->output(nh.nh_ifp, m, (struct sockaddr *)&dst, + if (V_nat64out->output(nh->nh_ifp, m, (struct sockaddr *)&dst, &cfg->stats, logdata) == 0) NAT64STAT_INC(&cfg->stats, opcnt64); return (NAT64RETURN); }