diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 011ad7aedeaf..a8c6fb5c9c7d 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -1,1066 +1,1078 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #include #include #endif /* INET6 */ #include #include #include #include #include static const char gifname[] = "gif"; /* * gif_mtx protects a per-vnet gif_softc_list. */ static VNET_DEFINE(struct mtx, gif_mtx); #define V_gif_mtx VNET(gif_mtx) static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list); #define V_gif_softc_list VNET(gif_softc_list) static struct sx gif_ioctl_sx; SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); #define GIF_LIST_LOCK_INIT(x) mtx_init(&V_gif_mtx, "gif_mtx", \ NULL, MTX_DEF) #define GIF_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gif_mtx) #define GIF_LIST_LOCK(x) mtx_lock(&V_gif_mtx) #define GIF_LIST_UNLOCK(x) mtx_unlock(&V_gif_mtx) void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); static int gif_check_nesting(struct ifnet *, struct mbuf *); static int gif_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *); static void gif_delete_tunnel(struct ifnet *); static int gif_ioctl(struct ifnet *, u_long, caddr_t); static int gif_transmit(struct ifnet *, struct mbuf *); static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); static VNET_DEFINE(struct 
if_clone *, gif_cloner); #define V_gif_cloner VNET(gif_cloner) static int gifmodevent(module_t, int, void *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limitation on nesting of gif tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gif tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. */ #define MAX_GIF_NEST 1 #endif static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); /* * By default, we disallow creation of multiple tunnels between the same * pair of addresses. Some applications require this functionality so * we allow control over this check here. 
*/
#ifdef XBONEHACK
static VNET_DEFINE(int, parallel_tunnels) = 1;
#else
static VNET_DEFINE(int, parallel_tunnels) = 0;
#endif
#define V_parallel_tunnels VNET(parallel_tunnels)
SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0,
    "Allow parallel tunnels?");

/*
 * Cloner "create" callback (registered by vnet_gif_init() below).
 *
 * Allocates a zeroed softc and an IFT_GIF ifnet, installs the gif
 * ioctl/transmit/qflush/output handlers, attaches the interface and a
 * DLT_NULL BPF tap, calls the netgraph attach hook when one is installed,
 * and finally links the softc onto the per-vnet softc list.
 *
 * The ifc and params arguments are not used here.  Always returns 0.
 */
static int
gif_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct gif_softc *sc;

	sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
	/* The tunnel starts out in the FIB of the creating process. */
	sc->gif_fibnum = curthread->td_proc->p_fibnum;
	GIF2IFP(sc) = if_alloc(IFT_GIF);
	GIF_LOCK_INIT(sc);
	GIF2IFP(sc)->if_softc = sc;
	if_initname(GIF2IFP(sc), gifname, unit);

	GIF2IFP(sc)->if_addrlen = 0;
	GIF2IFP(sc)->if_mtu = GIF_MTU;
	GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
#if 0
	/* turn off ingress filter */
	GIF2IFP(sc)->if_flags |= IFF_LINK2;
#endif
	GIF2IFP(sc)->if_ioctl = gif_ioctl;
	GIF2IFP(sc)->if_transmit = gif_transmit;
	GIF2IFP(sc)->if_qflush = gif_qflush;
	GIF2IFP(sc)->if_output = gif_output;
	GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE;
	GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE;
	if_attach(GIF2IFP(sc));
	/* DLT_NULL tap: each captured packet is prefixed by a 32-bit AF. */
	bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t));
	if (ng_gif_attach_p != NULL)
		(*ng_gif_attach_p)(GIF2IFP(sc));

	/* Publish the softc last, once it is fully set up. */
	GIF_LIST_LOCK();
	LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
	GIF_LIST_UNLOCK();
	return (0);
}

/*
 * Cloner "destroy" callback.
 *
 * With gif_ioctl_sx held exclusively: tears down the tunnel, unlinks the
 * softc from the per-vnet list, calls the netgraph detach hook when one is
 * installed, detaches BPF and the ifnet, and clears if_softc (gif_ioctl()
 * returns ENXIO when it finds if_softc == NULL).  After the sx lock is
 * dropped the ifnet, the per-softc lock and the softc itself are released.
 */
static void
gif_clone_destroy(struct ifnet *ifp)
{
	struct gif_softc *sc;

	sx_xlock(&gif_ioctl_sx);
	sc = ifp->if_softc;
	gif_delete_tunnel(ifp);
	GIF_LIST_LOCK();
	LIST_REMOVE(sc, gif_list);
	GIF_LIST_UNLOCK();
	if (ng_gif_detach_p != NULL)
		(*ng_gif_detach_p)(ifp);
	bpfdetach(ifp);
	if_detach(ifp);
	ifp->if_softc = NULL;
	sx_xunlock(&gif_ioctl_sx);

	if_free(ifp);
	GIF_LOCK_DESTROY(sc);
	free(sc, M_GIF);
}

/*
 * Per-vnet initialisation: set up the softc list and its mutex, then
 * register the "gif" interface cloner.
 */
static void
vnet_gif_init(const void *unused __unused)
{

	LIST_INIT(&V_gif_softc_list);
	GIF_LIST_LOCK_INIT();
	V_gif_cloner = if_clone_simple(gifname, gif_clone_create,
	    gif_clone_destroy, 0);
}
VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
vnet_gif_init, NULL); static void vnet_gif_uninit(const void *unused __unused) { if_clone_detach(V_gif_cloner); GIF_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_uninit, NULL); static int gifmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); int gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { GIF_RLOCK_TRACKER; const struct ip *ip; struct gif_softc *sc; int ret; sc = (struct gif_softc *)arg; if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); ret = 0; GIF_RLOCK(sc); /* no physical address */ if (sc->gif_family == 0) goto done; switch (proto) { #ifdef INET case IPPROTO_IPV4: #endif #ifdef INET6 case IPPROTO_IPV6: #endif case IPPROTO_ETHERIP: break; default: goto done; } /* Bail on short packets */ M_ASSERTPKTHDR(m); if (m->m_pkthdr.len < sizeof(struct ip)) goto done; ip = mtod(m, const struct ip *); switch (ip->ip_v) { #ifdef INET case 4: if (sc->gif_family != AF_INET) goto done; ret = in_gif_encapcheck(m, off, proto, arg); break; #endif #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) goto done; if (sc->gif_family != AF_INET6) goto done; ret = in6_gif_encapcheck(m, off, proto, arg); break; #endif } done: GIF_RUNLOCK(sc); return (ret); } static int gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; struct etherip_header *eth; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif uint32_t af; uint8_t proto, ecn; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto err; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || sc->gif_family == 0 || 
(error = gif_check_nesting(ifp, m)) != 0) { m_freem(m); goto err; } /* Now pull back the af that we stashed in the csum_data. */ if (ifp->if_bridge) af = AF_LINK; else af = m->m_pkthdr.csum_data; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gif_fibnum); BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* inner AF-specific encapsulation */ ecn = 0; switch (af) { #ifdef INET case AF_INET: proto = IPPROTO_IPV4; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto err; } ip = mtod(m, struct ip *); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos); break; #endif #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { error = ENOBUFS; goto err; } t = 0; ip6 = mtod(m, struct ip6_hdr *); ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow); ecn = (ntohl(t) >> 20) & 0xff; break; #endif case AF_LINK: proto = IPPROTO_ETHERIP; M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto err; } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; eth->eip_ver = ETHERIP_VERSION; eth->eip_resvl = 0; break; default: error = EAFNOSUPPORT; m_freem(m); goto err; } /* XXX should we check if our outer source is legal? 
*/ /* dispatch to output logic based on outer AF */ switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: m_freem(m); } err: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static void gif_qflush(struct ifnet *ifp __unused) { } #define MTAG_GIF 1080679712 static int gif_check_nesting(struct ifnet *ifp, struct mbuf *m) { struct m_tag *mtag; int count; /* * gif may cause infinite recursion calls when misconfigured. * We'll prevent this by detecting loops. * * High nesting level may cause stack exhaustion. * We'll prevent this by introducing upper limit. */ count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, MTAG_GIF, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > V_max_gif_nesting) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(MTAG_GIF, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } int gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gif_transmit() routine, avoiding using yet another mtag. 
*/
	m->m_pkthdr.csum_data = af;
	return (ifp->if_transmit(ifp, m));
}

/*
 * Deliver a packet received on gif interface ifp to the upper layers.
 *
 * proto selects the inner address family: IPPROTO_IPV4 -> AF_INET,
 * IPPROTO_IPV6 -> AF_INET6, IPPROTO_ETHERIP -> AF_LINK; anything else is
 * dropped.  ecn is passed to ip_ecn_egress()/ip6_ecn_egress() together
 * with the inner header's TOS/flow field (presumably it is the outer
 * header's TOS byte -- confirm against the callers).
 *
 * The mbuf is always consumed.  IP packets are counted and queued through
 * netisr_dispatch(); EtherIP frames are handed to the bridge when the
 * interface is a bridge member and freed otherwise.  Every path that
 * reaches the drop label bumps IFCOUNTER_IERRORS.
 */
void
gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
{
	struct etherip_header *eip;
#ifdef INET
	struct ip *ip;
#endif
#ifdef INET6
	struct ip6_hdr *ip6;
	uint32_t t;
#endif
	struct ether_header *eh;
	struct ifnet *oldifp;
	int isr, n, af;

	if (ifp == NULL) {
		/* just in case */
		m_freem(m);
		return;
	}
	m->m_pkthdr.rcvif = ifp;
	m_clrprotoflags(m);
	switch (proto) {
#ifdef INET
	case IPPROTO_IPV4:
		af = AF_INET;
		if (m->m_len < sizeof(struct ip))
			m = m_pullup(m, sizeof(struct ip));
		if (m == NULL)
			goto drop;
		ip = mtod(m, struct ip *);
		if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
		    ECN_NOCARE, &ecn, &ip->ip_tos) == 0) {
			m_freem(m);
			goto drop;
		}
		break;
#endif
#ifdef INET6
	case IPPROTO_IPV6:
		af = AF_INET6;
		if (m->m_len < sizeof(struct ip6_hdr))
			m = m_pullup(m, sizeof(struct ip6_hdr));
		if (m == NULL)
			goto drop;
		/* Place ecn at bit 20 of a network-order flow word. */
		t = htonl((uint32_t)ecn << 20);
		ip6 = mtod(m, struct ip6_hdr *);
		if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
		    ECN_NOCARE, &t, &ip6->ip6_flow) == 0) {
			m_freem(m);
			goto drop;
		}
		break;
#endif
	case IPPROTO_ETHERIP:
		af = AF_LINK;
		break;
	default:
		m_freem(m);
		goto drop;
	}

#ifdef MAC
	mac_ifnet_create_mbuf(ifp, m);
#endif

	if (bpf_peers_present(ifp->if_bpf)) {
		uint32_t af1 = af;
		bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
	}

	/* In monitor mode the packet is counted and tapped, never delivered. */
	if ((ifp->if_flags & IFF_MONITOR) != 0) {
		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
		m_freem(m);
		return;
	}

	if (ng_gif_input_p != NULL) {
		(*ng_gif_input_p)(ifp, &m, af);
		/* The hook may set m to NULL; that is counted as an error. */
		if (m == NULL)
			goto drop;
	}

	/*
	 * Put the packet to the network layer input queue according to the
	 * specified address family.
	 * Note: older versions of gif_input directly called network layer
	 * input functions, e.g. ip6_input, here. We changed the policy to
	 * prevent too many recursive calls of such input functions, which
	 * might cause kernel panic. But the change may introduce another
	 * problem; if the input queue is full, packets are discarded.
	 * The kernel stack overflow really happened, and we believed
	 * queue-full rarely occurs, so we changed the policy.
	 */
	switch (af) {
#ifdef INET
	case AF_INET:
		isr = NETISR_IP;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		isr = NETISR_IPV6;
		break;
#endif
	case AF_LINK:
		/* Need both the EtherIP and the Ethernet header contiguous. */
		n = sizeof(struct etherip_header) + sizeof(struct ether_header);
		if (n > m->m_len)
			m = m_pullup(m, n);
		if (m == NULL)
			goto drop;
		eip = mtod(m, struct etherip_header *);
		if (eip->eip_ver != ETHERIP_VERSION) {
			/* discard unknown versions */
			m_freem(m);
			goto drop;
		}
		m_adj(m, sizeof(struct etherip_header));

		m->m_flags &= ~(M_BCAST|M_MCAST);
		m->m_pkthdr.rcvif = ifp;

		if (ifp->if_bridge) {
			oldifp = ifp;
			eh = mtod(m, struct ether_header *);
			if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
				if (ETHER_IS_BROADCAST(eh->ether_dhost))
					m->m_flags |= M_BCAST;
				else
					m->m_flags |= M_MCAST;
				if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
			}
			BRIDGE_INPUT(ifp, m);

			if (m != NULL && ifp != oldifp) {
				/*
				 * The bridge gave us back itself or one of the
				 * members for which the frame is addressed.
				 */
				ether_demux(ifp, m);
				return;
			}
		}
		/* Not bridged, or the bridge handed the frame back: discard. */
		if (m != NULL)
			m_freem(m);
		return;

	default:
		if (ng_gif_input_orphan_p != NULL)
			(*ng_gif_input_orphan_p)(ifp, m, af);
		else
			m_freem(m);
		return;
	}

	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
	M_SETFIB(m, ifp->if_fib);
	netisr_dispatch(isr, m);
	return;
drop:
	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
}

/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR?
*/
/*
 * Interface ioctl handler.
 *
 * A first switch answers the simple requests (address/flags/multicast
 * no-ops and SIOCSIFMTU) without taking any lock.  Everything else runs
 * with gif_ioctl_sx held exclusively: setting, deleting and reading the
 * tunnel endpoints, the tunnel FIB and the gif option bits.
 *
 * Returns 0 or an errno value; ENXIO if the softc is already gone and
 * EINVAL for unrecognised commands.
 */
int
gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	GIF_RLOCK_TRACKER;
	struct ifreq *ifr = (struct ifreq*)data;
	struct sockaddr *dst, *src;
	struct gif_softc *sc;
#ifdef INET
	struct sockaddr_in *sin = NULL;
#endif
#ifdef INET6
	struct sockaddr_in6 *sin6 = NULL;
#endif
	u_int options;
	int error;

	switch (cmd) {
	case SIOCSIFADDR:
		ifp->if_flags |= IFF_UP;
		/* FALLTHROUGH */
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCGIFMTU:
	case SIOCSIFFLAGS:
		return (0);
	case SIOCSIFMTU:
		if (ifr->ifr_mtu < GIF_MTU_MIN ||
		    ifr->ifr_mtu > GIF_MTU_MAX)
			return (EINVAL);
		else
			ifp->if_mtu = ifr->ifr_mtu;
		return (0);
	}
	sx_xlock(&gif_ioctl_sx);
	sc = ifp->if_softc;
	/* gif_clone_destroy() clears if_softc while holding this lock. */
	if (sc == NULL) {
		error = ENXIO;
		goto bad;
	}
	error = 0;
	switch (cmd) {
	case SIOCSIFPHYADDR:
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif
		/* EINVAL is the result of the bare "goto bad" exits below. */
		error = EINVAL;
		switch (cmd) {
#ifdef INET
		case SIOCSIFPHYADDR:
			src = (struct sockaddr *)
				&(((struct in_aliasreq *)data)->ifra_addr);
			dst = (struct sockaddr *)
				&(((struct in_aliasreq *)data)->ifra_dstaddr);
			break;
#endif
#ifdef INET6
		case SIOCSIFPHYADDR_IN6:
			src = (struct sockaddr *)
				&(((struct in6_aliasreq *)data)->ifra_addr);
			dst = (struct sockaddr *)
				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
			break;
#endif
		default:
			goto bad;
		}
		/* sa_family must be equal */
		if (src->sa_family != dst->sa_family ||
		    src->sa_len != dst->sa_len)
			goto bad;

		/* validate sa_len */
		/* check sa_family looks sane for the cmd */
		switch (src->sa_family) {
#ifdef INET
		case AF_INET:
			if (src->sa_len != sizeof(struct sockaddr_in))
				goto bad;
			if (cmd != SIOCSIFPHYADDR) {
				error = EAFNOSUPPORT;
				goto bad;
			}
			if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
			    satosin(dst)->sin_addr.s_addr == INADDR_ANY) {
				error = EADDRNOTAVAIL;
				goto bad;
			}
			break;
#endif
#ifdef INET6
		case AF_INET6:
			if (src->sa_len != sizeof(struct sockaddr_in6))
				goto bad;
			if (cmd != SIOCSIFPHYADDR_IN6) {
				error = EAFNOSUPPORT;
				goto bad;
			}
			error = EADDRNOTAVAIL;
			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)
			    ||
			    IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr))
				goto bad;
			/*
			 * Check validity of the scope zone ID of the
			 * addresses, and convert it into the kernel
			 * internal form if necessary.
			 */
			error = sa6_embedscope(satosin6(src), 0);
			if (error != 0)
				goto bad;
			error = sa6_embedscope(satosin6(dst), 0);
			if (error != 0)
				goto bad;
			break;
#endif
		default:
			error = EAFNOSUPPORT;
			goto bad;
		}
		error = gif_set_tunnel(ifp, src, dst);
		break;
	case SIOCDIFPHYADDR:
		gif_delete_tunnel(ifp);
		break;
	case SIOCGIFPSRCADDR:
	case SIOCGIFPDSTADDR:
#ifdef INET6
	case SIOCGIFPSRCADDR_IN6:
	case SIOCGIFPDSTADDR_IN6:
#endif
		if (sc->gif_family == 0) {
			error = EADDRNOTAVAIL;
			break;
		}
		/*
		 * Step 1 (under the softc read lock): prepare the reply
		 * sockaddr and copy the requested endpoint out of the
		 * stored outer header.
		 */
		GIF_RLOCK(sc);
		switch (cmd) {
#ifdef INET
		case SIOCGIFPSRCADDR:
		case SIOCGIFPDSTADDR:
			if (sc->gif_family != AF_INET) {
				error = EADDRNOTAVAIL;
				break;
			}
			sin = (struct sockaddr_in *)&ifr->ifr_addr;
			memset(sin, 0, sizeof(*sin));
			sin->sin_family = AF_INET;
			sin->sin_len = sizeof(*sin);
			break;
#endif
#ifdef INET6
		case SIOCGIFPSRCADDR_IN6:
		case SIOCGIFPDSTADDR_IN6:
			if (sc->gif_family != AF_INET6) {
				error = EADDRNOTAVAIL;
				break;
			}
			sin6 = (struct sockaddr_in6 *)
				&(((struct in6_ifreq *)data)->ifr_addr);
			memset(sin6, 0, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(*sin6);
			break;
#endif
		default:
			error = EAFNOSUPPORT;
		}
		if (error == 0) {
			switch (cmd) {
#ifdef INET
			case SIOCGIFPSRCADDR:
				sin->sin_addr = sc->gif_iphdr->ip_src;
				break;
			case SIOCGIFPDSTADDR:
				sin->sin_addr = sc->gif_iphdr->ip_dst;
				break;
#endif
#ifdef INET6
			case SIOCGIFPSRCADDR_IN6:
				sin6->sin6_addr = sc->gif_ip6hdr->ip6_src;
				break;
			case SIOCGIFPDSTADDR_IN6:
				sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst;
				break;
#endif
			}
		}
		GIF_RUNLOCK(sc);
		if (error != 0)
			break;
		/*
		 * Step 2 (softc lock dropped): run the prison_if() check
		 * and, for IPv6, sa6_recoverscope(); the reply is zeroed
		 * again if either fails.
		 */
		switch (cmd) {
#ifdef INET
		case SIOCGIFPSRCADDR:
		case SIOCGIFPDSTADDR:
			error = prison_if(curthread->td_ucred,
			    (struct sockaddr *)sin);
			if (error != 0)
				memset(sin, 0, sizeof(*sin));
			break;
#endif
#ifdef INET6
		case SIOCGIFPSRCADDR_IN6:
		case SIOCGIFPDSTADDR_IN6:
			error = prison_if(curthread->td_ucred,
			    (struct sockaddr *)sin6);
			if (error == 0)
				error = sa6_recoverscope(sin6);
			if (error != 0)
				memset(sin6, 0, sizeof(*sin6));
#endif
		}
		break;
	case SIOCGTUNFIB:
		ifr->ifr_fib = sc->gif_fibnum;
		break;
	case SIOCSTUNFIB:
		if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
			break;
		if (ifr->ifr_fib >= rt_numfibs)
			error = EINVAL;
		else
			sc->gif_fibnum = ifr->ifr_fib;
		break;
	case GIFGOPTS:
		options = sc->gif_options;
		error = copyout(&options, ifr_data_get_ptr(ifr),
		    sizeof(options));
		break;
	case GIFSOPTS:
		if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
			break;
		error = copyin(ifr_data_get_ptr(ifr), &options,
		    sizeof(options));
		if (error)
			break;
		/* Reject any bit outside GIF_OPTMASK. */
		if (options & ~GIF_OPTMASK)
			error = EINVAL;
		else
			sc->gif_options = options;
		break;

	default:
		error = EINVAL;
		break;
	}
bad:
	sx_xunlock(&gif_ioctl_sx);
	return (error);
}

/*
 * Drop the tunnel's encapsulation registration, if there is one.
 *
 * The patched version takes the address family so that the matching
 * family-specific detach routine is called.  gif_ioctl_sx must be held
 * exclusively.
 *
 * NOTE(review): the cookie pointer is cleared unconditionally, so a
 * non-NULL cookie with a family that matches no case is forgotten without
 * a detach call -- confirm callers can never get into that state.
 */
static void
-gif_detach(struct gif_softc *sc)
+gif_detach(struct gif_softc *sc, int family)
{

	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
-	if (sc->gif_ecookie != NULL)
-		encap_detach(sc->gif_ecookie);
+	if (sc->gif_ecookie != NULL) {
+		switch (family) {
+#ifdef INET
+		case AF_INET:
+			ip_encap_detach(sc->gif_ecookie);
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			ip6_encap_detach(sc->gif_ecookie);
+			break;
+#endif
+		}
+	}
	sc->gif_ecookie = NULL;
}

/*
 * Register the tunnel with the family-specific attach routine for outer
 * address family af.  gif_ioctl_sx must be held exclusively.  Returns the
 * attach routine's result, or EAFNOSUPPORT for a family that is not
 * compiled in.
 */
static int
gif_attach(struct gif_softc *sc, int af)
{

	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
	switch (af) {
#ifdef INET
	case AF_INET:
		return (in_gif_attach(sc));
#endif
#ifdef INET6
	case AF_INET6:
		return (in6_gif_attach(sc));
#endif
	}
	return (EAFNOSUPPORT);
}

/*
 * Configure (or reconfigure) the outer endpoints of the tunnel.
 *
 * Unless parallel tunnels are allowed, another gif interface in this vnet
 * that already uses the same source/destination pair makes the request
 * fail with EADDRNOTAVAIL.  Otherwise a fresh outer header template is
 * allocated, the encapsulation registration is switched when the family
 * changes, and the template and family are swapped in under the softc
 * write lock.  The interface is marked running and link-up on success.
 *
 * NOTE(review): every exit through the "bad" label with error != 0 clears
 * IFF_DRV_RUNNING and reports link down, including the duplicate-tunnel
 * rejection, where the previously configured tunnel was left untouched.
 * NOTE(review): when gif_attach() fails, the new family and header are
 * still installed below -- confirm this is intended.
 */
static int
gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
{
	struct gif_softc *sc = ifp->if_softc;
	struct gif_softc *tsc;
#ifdef INET
	struct ip *ip;
#endif
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	void *hdr;
	int error = 0;

	if (sc == NULL)
		return (ENXIO);
	/* Disallow parallel tunnels unless instructed otherwise. */
	if (V_parallel_tunnels == 0) {
		GIF_LIST_LOCK();
		LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) {
			if (tsc == sc || tsc->gif_family != src->sa_family)
				continue;
#ifdef INET
			if (tsc->gif_family == AF_INET &&
			    tsc->gif_iphdr->ip_src.s_addr ==
			    satosin(src)->sin_addr.s_addr &&
			    tsc->gif_iphdr->ip_dst.s_addr ==
			    satosin(dst)->sin_addr.s_addr) {
				error = EADDRNOTAVAIL;
				GIF_LIST_UNLOCK();
				goto bad;
			}
#endif
#ifdef INET6
			if (tsc->gif_family == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src,
			    &satosin6(src)->sin6_addr) &&
			    IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst,
			    &satosin6(dst)->sin6_addr)) {
				error = EADDRNOTAVAIL;
				GIF_LIST_UNLOCK();
				goto bad;
			}
#endif
		}
		GIF_LIST_UNLOCK();
	}
	/* Build the outer header template for the new endpoints. */
	switch (src->sa_family) {
#ifdef INET
	case AF_INET:
		hdr = ip = malloc(sizeof(struct ip), M_GIF,
		    M_WAITOK | M_ZERO);
		ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr;
		ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF,
		    M_WAITOK | M_ZERO);
		ip6->ip6_src = satosin6(src)->sin6_addr;
		ip6->ip6_dst = satosin6(dst)->sin6_addr;
		ip6->ip6_vfc = IPV6_VERSION;
		break;
#endif
	default:
		return (EAFNOSUPPORT);
	}

	if (sc->gif_family != src->sa_family)
-		gif_detach(sc);
+		gif_detach(sc, sc->gif_family);
	if (sc->gif_family == 0 ||
	    sc->gif_family != src->sa_family)
		error = gif_attach(sc, src->sa_family);

	GIF_WLOCK(sc);
	if (sc->gif_family != 0)
		free(sc->gif_hdr, M_GIF);
	sc->gif_family = src->sa_family;
	sc->gif_hdr = hdr;
	GIF_WUNLOCK(sc);
#if defined(INET) || defined(INET6)
bad:
#endif
	if (error == 0 && sc->gif_family != 0) {
		ifp->if_drv_flags |= IFF_DRV_RUNNING;
		if_link_state_change(ifp, LINK_STATE_UP);
	} else {
		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
		if_link_state_change(ifp, LINK_STATE_DOWN);
	}
	return (error);
}

/*
 * Remove the tunnel configuration.
 *
 * The family is reset to 0 under the softc write lock first; only then,
 * if a tunnel was configured, is the encapsulation registration dropped
 * and the outer header template freed.  The interface always ends up not
 * running with link state down.  Does nothing when if_softc is NULL.
 */
static void
gif_delete_tunnel(struct ifnet *ifp)
{
	struct gif_softc *sc = ifp->if_softc;
	int family;

	if (sc == NULL)
		return;

	GIF_WLOCK(sc);
	family = sc->gif_family;
	sc->gif_family = 0;
	GIF_WUNLOCK(sc);
	if (family != 0) {
-		gif_detach(sc);
+		gif_detach(sc, family);
		free(sc->gif_hdr, M_GIF);
	}
	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if_link_state_change(ifp, LINK_STATE_DOWN);
}
diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c
index a3e578e4cad3..2160f350bf9b 100644
--- a/sys/net/if_gre.c
+++ b/sys/net/if_gre.c
@@ -1,999 +1,1006 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * Copyright (c) 2014 Andrey V. Elsukov
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Heiko W.Rupp
 *
 * IPv6-over-GRE contributed by Gert Doering
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
* * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #include #endif #ifdef INET6 #include #include #include #include #endif #include #include #include #include #include #define GREMTU 1476 static const char grename[] = "gre"; static MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation"); static VNET_DEFINE(struct mtx, gre_mtx); #define V_gre_mtx VNET(gre_mtx) #define GRE_LIST_LOCK_INIT(x) mtx_init(&V_gre_mtx, "gre_mtx", NULL, \ MTX_DEF) #define GRE_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gre_mtx) #define GRE_LIST_LOCK(x) mtx_lock(&V_gre_mtx) #define GRE_LIST_UNLOCK(x) mtx_unlock(&V_gre_mtx) static VNET_DEFINE(LIST_HEAD(, gre_softc), gre_softc_list); #define V_gre_softc_list VNET(gre_softc_list) static struct sx gre_ioctl_sx; SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl"); static int gre_clone_create(struct if_clone *, int, caddr_t); static void gre_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, gre_cloner); #define V_gre_cloner VNET(gre_cloner) static void gre_qflush(struct ifnet *); static int gre_transmit(struct ifnet *, struct mbuf *); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void gre_updatehdr(struct gre_softc *); static int gre_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *); static void gre_delete_tunnel(struct ifnet *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre 
tunnels.
 * Since, setting a large value to this macro with a careless configuration
 * may introduce system crash, we don't allow any nestings by default.
 * If you need to configure nested gre tunnels, you can define this macro
 * in your kernel configuration file. However, if you do so, please be
 * careful to configure the tunnels so that it won't make a loop.
 */
#define MAX_GRE_NEST 1
#endif

/* Per-vnet nesting limit, initialised from MAX_GRE_NEST. */
static VNET_DEFINE(int, max_gre_nesting) = MAX_GRE_NEST;
#define V_max_gre_nesting VNET(max_gre_nesting)
SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels");

/*
 * Per-vnet initialisation: set up the softc list and its mutex, then
 * register the "gre" interface cloner.
 */
static void
vnet_gre_init(const void *unused __unused)
{
	LIST_INIT(&V_gre_softc_list);
	GRE_LIST_LOCK_INIT();
	V_gre_cloner = if_clone_simple(grename, gre_clone_create,
	    gre_clone_destroy, 0);
}
VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    vnet_gre_init, NULL);

/*
 * Per-vnet teardown: unregister the cloner and destroy the list mutex.
 */
static void
vnet_gre_uninit(const void *unused __unused)
{

	if_clone_detach(V_gre_cloner);
	GRE_LIST_LOCK_DESTROY();
}
VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    vnet_gre_uninit, NULL);

/*
 * Cloner "create" callback (registered by vnet_gre_init() above).
 *
 * Allocates a zeroed softc and an IFT_TUNNEL ifnet, installs the gre
 * output/ioctl/transmit/qflush handlers, attaches the interface and a
 * DLT_NULL BPF tap, and links the softc onto the per-vnet softc list.
 *
 * The ifc and params arguments are not used here.  Always returns 0.
 */
static int
gre_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct gre_softc *sc;

	sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO);
	/* The tunnel starts out in the FIB of the creating process. */
	sc->gre_fibnum = curthread->td_proc->p_fibnum;
	GRE2IFP(sc) = if_alloc(IFT_TUNNEL);
	GRE_LOCK_INIT(sc);
	GRE2IFP(sc)->if_softc = sc;
	if_initname(GRE2IFP(sc), grename, unit);

	GRE2IFP(sc)->if_mtu = GREMTU;
	GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST;
	GRE2IFP(sc)->if_output = gre_output;
	GRE2IFP(sc)->if_ioctl = gre_ioctl;
	GRE2IFP(sc)->if_transmit = gre_transmit;
	GRE2IFP(sc)->if_qflush = gre_qflush;
	GRE2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE;
	GRE2IFP(sc)->if_capenable |= IFCAP_LINKSTATE;
	if_attach(GRE2IFP(sc));
	/* DLT_NULL tap: each captured packet is prefixed by a 32-bit AF. */
	bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t));
	/* Publish the softc last, once it is fully set up. */
	GRE_LIST_LOCK();
	LIST_INSERT_HEAD(&V_gre_softc_list, sc, gre_list);
	GRE_LIST_UNLOCK();
	return (0);
}

static void
gre_clone_destroy(struct ifnet *ifp) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; gre_delete_tunnel(ifp); GRE_LIST_LOCK(); LIST_REMOVE(sc, gre_list); GRE_LIST_UNLOCK(); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gre_ioctl_sx); if_free(ifp); GRE_LOCK_DESTROY(sc); free(sc, M_GRE); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { GRE_RLOCK_TRACKER; struct ifreq *ifr = (struct ifreq *)data; struct sockaddr *src, *dst; struct gre_softc *sc; #ifdef INET struct sockaddr_in *sin = NULL; #endif #ifdef INET6 struct sockaddr_in6 *sin6 = NULL; #endif uint32_t opt; int error; switch (cmd) { case SIOCSIFMTU: /* XXX: */ if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); case GRESADDRS: case GRESADDRD: case GREGADDRS: case GREGADDRD: case GRESPROTO: case GREGPROTO: return (EOPNOTSUPP); } src = dst = NULL; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif error = EINVAL; switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: src = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_dstaddr); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: src = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_dstaddr); break; #endif default: error = EAFNOSUPPORT; goto end; } /* sa_family must be equal */ if (src->sa_family != dst->sa_family || src->sa_len != dst->sa_len) goto end; /* validate sa_len */ switch (src->sa_family) { #ifdef INET case AF_INET: if (src->sa_len != sizeof(struct sockaddr_in)) goto end; break; #endif #ifdef INET6 case AF_INET6: if (src->sa_len != sizeof(struct sockaddr_in6)) goto end; break; 
#endif default: error = EAFNOSUPPORT; goto end; } /* check sa_family looks sane for the cmd */ error = EAFNOSUPPORT; switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: if (src->sa_family == AF_INET) break; goto end; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: if (src->sa_family == AF_INET6) break; goto end; #endif } error = EADDRNOTAVAIL; switch (src->sa_family) { #ifdef INET case AF_INET: if (satosin(src)->sin_addr.s_addr == INADDR_ANY || satosin(dst)->sin_addr.s_addr == INADDR_ANY) goto end; break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr)) goto end; /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ error = sa6_embedscope(satosin6(src), 0); if (error != 0) goto end; error = sa6_embedscope(satosin6(dst), 0); if (error != 0) goto end; #endif } error = gre_set_tunnel(ifp, src, dst); break; case SIOCDIFPHYADDR: gre_delete_tunnel(ifp); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: #endif if (sc->gre_family == 0) { error = EADDRNOTAVAIL; break; } GRE_RLOCK(sc); switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (sc->gre_family != AF_INET) { error = EADDRNOTAVAIL; break; } sin = (struct sockaddr_in *)&ifr->ifr_addr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); break; #endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gre_family != AF_INET6) { error = EADDRNOTAVAIL; break; } sin6 = (struct sockaddr_in6 *) &(((struct in6_ifreq *)data)->ifr_addr); memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); break; #endif } if (error == 0) { switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: sin->sin_addr = sc->gre_oip.ip_src; break; case SIOCGIFPDSTADDR: sin->sin_addr = sc->gre_oip.ip_dst; break; #endif #ifdef 
INET6 case SIOCGIFPSRCADDR_IN6: sin6->sin6_addr = sc->gre_oip6.ip6_src; break; case SIOCGIFPDSTADDR_IN6: sin6->sin6_addr = sc->gre_oip6.ip6_dst; break; #endif } } GRE_RUNLOCK(sc); if (error != 0) break; switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = prison_if(curthread->td_ucred, (struct sockaddr *)sin); if (error != 0) memset(sin, 0, sizeof(*sin)); break; #endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = prison_if(curthread->td_ucred, (struct sockaddr *)sin6); if (error == 0) error = sa6_recoverscope(sin6); if (error != 0) memset(sin6, 0, sizeof(*sin6)); #endif } break; case SIOCGTUNFIB: ifr->ifr_fib = sc->gre_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gre_fibnum = ifr->ifr_fib; break; case GRESKEY: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, sizeof(opt))) != 0) break; if (sc->gre_key != opt) { GRE_WLOCK(sc); sc->gre_key = opt; gre_updatehdr(sc); GRE_WUNLOCK(sc); } break; case GREGKEY: error = copyout(&sc->gre_key, ifr_data_get_ptr(ifr), sizeof(sc->gre_key)); break; case GRESOPTS: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, sizeof(opt))) != 0) break; if (opt & ~GRE_OPTMASK) error = EINVAL; else { if (sc->gre_options != opt) { GRE_WLOCK(sc); sc->gre_options = opt; gre_updatehdr(sc); GRE_WUNLOCK(sc); } } break; case GREGOPTS: error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; default: error = EINVAL; break; } end: sx_xunlock(&gre_ioctl_sx); return (error); } static void gre_updatehdr(struct gre_softc *sc) { struct grehdr *gh = NULL; uint32_t *opts; uint16_t flags; GRE_WLOCK_ASSERT(sc); switch (sc->gre_family) { #ifdef INET case AF_INET: sc->gre_hlen = sizeof(struct greip); sc->gre_oip.ip_v = IPPROTO_IPV4; sc->gre_oip.ip_hl = 
sizeof(struct ip) >> 2; sc->gre_oip.ip_p = IPPROTO_GRE; gh = &sc->gre_gihdr->gi_gre; break; #endif #ifdef INET6 case AF_INET6: sc->gre_hlen = sizeof(struct greip6); sc->gre_oip6.ip6_vfc = IPV6_VERSION; sc->gre_oip6.ip6_nxt = IPPROTO_GRE; gh = &sc->gre_gi6hdr->gi6_gre; break; #endif default: return; } flags = 0; opts = gh->gre_opts; if (sc->gre_options & GRE_ENABLE_CSUM) { flags |= GRE_FLAGS_CP; sc->gre_hlen += 2 * sizeof(uint16_t); *opts++ = 0; } if (sc->gre_key != 0) { flags |= GRE_FLAGS_KP; sc->gre_hlen += sizeof(uint32_t); *opts++ = htonl(sc->gre_key); } if (sc->gre_options & GRE_ENABLE_SEQ) { flags |= GRE_FLAGS_SP; sc->gre_hlen += sizeof(uint32_t); *opts++ = 0; } else sc->gre_oseq = 0; gh->gre_flags = htons(flags); } static void -gre_detach(struct gre_softc *sc) +gre_detach(struct gre_softc *sc, int family) { sx_assert(&gre_ioctl_sx, SA_XLOCKED); - if (sc->gre_ecookie != NULL) - encap_detach(sc->gre_ecookie); + if (sc->gre_ecookie != NULL) { + switch (family) { +#ifdef INET + case AF_INET: + ip_encap_detach(sc->gre_ecookie); + break; +#endif +#ifdef INET6 + case AF_INET6: + ip6_encap_detach(sc->gre_ecookie); + break; +#endif + } + } sc->gre_ecookie = NULL; } static int gre_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) { struct gre_softc *sc, *tsc; #ifdef INET6 struct ip6_hdr *ip6; #endif #ifdef INET struct ip *ip; #endif void *hdr; int error; sx_assert(&gre_ioctl_sx, SA_XLOCKED); GRE_LIST_LOCK(); sc = ifp->if_softc; LIST_FOREACH(tsc, &V_gre_softc_list, gre_list) { if (tsc == sc || tsc->gre_family != src->sa_family) continue; #ifdef INET if (tsc->gre_family == AF_INET && tsc->gre_oip.ip_src.s_addr == satosin(src)->sin_addr.s_addr && tsc->gre_oip.ip_dst.s_addr == satosin(dst)->sin_addr.s_addr) { GRE_LIST_UNLOCK(); return (EADDRNOTAVAIL); } #endif #ifdef INET6 if (tsc->gre_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&tsc->gre_oip6.ip6_src, &satosin6(src)->sin6_addr) && IN6_ARE_ADDR_EQUAL(&tsc->gre_oip6.ip6_dst, &satosin6(dst)->sin6_addr)) 
{ GRE_LIST_UNLOCK(); return (EADDRNOTAVAIL); } #endif } GRE_LIST_UNLOCK(); switch (src->sa_family) { #ifdef INET case AF_INET: hdr = ip = malloc(sizeof(struct greip) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip->ip_src = satosin(src)->sin_addr; ip->ip_dst = satosin(dst)->sin_addr; break; #endif #ifdef INET6 case AF_INET6: hdr = ip6 = malloc(sizeof(struct greip6) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip6->ip6_src = satosin6(src)->sin6_addr; ip6->ip6_dst = satosin6(dst)->sin6_addr; break; #endif default: return (EAFNOSUPPORT); } if (sc->gre_family != 0) - gre_detach(sc); + gre_detach(sc, sc->gre_family); GRE_WLOCK(sc); if (sc->gre_family != 0) free(sc->gre_hdr, M_GRE); sc->gre_family = src->sa_family; sc->gre_hdr = hdr; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; gre_updatehdr(sc); GRE_WUNLOCK(sc); error = 0; switch (src->sa_family) { #ifdef INET case AF_INET: error = in_gre_attach(sc); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_attach(sc); break; #endif } if (error == 0) { ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_UP); } return (error); } static void gre_delete_tunnel(struct ifnet *ifp) { struct gre_softc *sc = ifp->if_softc; int family; GRE_WLOCK(sc); family = sc->gre_family; sc->gre_family = 0; GRE_WUNLOCK(sc); if (family != 0) { - gre_detach(sc); + gre_detach(sc, family); free(sc->gre_hdr, M_GRE); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_DOWN); } int -gre_input(struct mbuf **mp, int *offp, int proto) +gre_input(struct mbuf *m, int off, int proto, void *arg) { - struct gre_softc *sc; + struct gre_softc *sc = arg; struct grehdr *gh; struct ifnet *ifp; - struct mbuf *m; uint32_t *opts; #ifdef notyet uint32_t key; #endif uint16_t flags; int hlen, isr, af; - m = *mp; - sc = encap_getarg(m); - KASSERT(sc != NULL, ("encap_getarg returned NULL")); - ifp = GRE2IFP(sc); - hlen = *offp + sizeof(struct grehdr) + 4 * sizeof(uint32_t); + hlen = off + sizeof(struct 
grehdr) + 4 * sizeof(uint32_t); if (m->m_pkthdr.len < hlen) goto drop; if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto drop; } - gh = (struct grehdr *)mtodo(m, *offp); + gh = (struct grehdr *)mtodo(m, off); flags = ntohs(gh->gre_flags); if (flags & ~GRE_FLAGS_MASK) goto drop; opts = gh->gre_opts; hlen = 2 * sizeof(uint16_t); if (flags & GRE_FLAGS_CP) { /* reserved1 field must be zero */ if (((uint16_t *)opts)[1] != 0) goto drop; - if (in_cksum_skip(m, m->m_pkthdr.len, *offp) != 0) + if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0) goto drop; hlen += 2 * sizeof(uint16_t); opts++; } if (flags & GRE_FLAGS_KP) { #ifdef notyet /* * XXX: The current implementation uses the key only for outgoing * packets. But we can check the key value here, or even in the * encapcheck function. */ key = ntohl(*opts); #endif hlen += sizeof(uint32_t); opts++; } #ifdef notyet } else key = 0; if (sc->gre_key != 0 && (key != sc->gre_key || key != 0)) goto drop; #endif if (flags & GRE_FLAGS_SP) { #ifdef notyet seq = ntohl(*opts); #endif hlen += sizeof(uint32_t); } switch (ntohs(gh->gre_proto)) { case ETHERTYPE_WCCP: /* * For WCCP skip an additional 4 bytes if after GRE header * doesn't follow an IP header. 
*/ if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40) hlen += sizeof(uint32_t); /* FALLTHROUGH */ case ETHERTYPE_IP: isr = NETISR_IP; af = AF_INET; break; case ETHERTYPE_IPV6: isr = NETISR_IPV6; af = AF_INET6; break; default: goto drop; } - m_adj(m, *offp + hlen); + m_adj(m, off + hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(isr, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return (IPPROTO_DONE); } #define MTAG_GRE 1307983903 static int gre_check_nesting(struct ifnet *ifp, struct mbuf *m) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, MTAG_GRE, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", ifp->if_xname); return (EIO); } count++; } if (count > V_max_gre_nesting) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", ifp->if_xname, count); return (EIO); } mtag = m_tag_alloc(MTAG_GRE, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } static int gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto drop; } error = gre_check_nesting(ifp, m); if (error != 0) goto drop; m->m_flags &= ~(M_BCAST|M_MCAST); if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_pkthdr.csum_data = af; /* save af for 
if_transmit */ return (ifp->if_transmit(ifp, m)); drop: m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static void gre_setseqn(struct grehdr *gh, uint32_t seq) { uint32_t *opts; uint16_t flags; opts = gh->gre_opts; flags = ntohs(gh->gre_flags); KASSERT((flags & GRE_FLAGS_SP) != 0, ("gre_setseqn called, but GRE_FLAGS_SP isn't set ")); if (flags & GRE_FLAGS_CP) opts++; if (flags & GRE_FLAGS_KP) opts++; *opts = htonl(seq); } static int gre_transmit(struct ifnet *ifp, struct mbuf *m) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; uint32_t iaf, oaf, oseq; int error, hlen, olen, plen; int want_seq, want_csum; plen = 0; sc = ifp->if_softc; if (sc == NULL) { error = ENETDOWN; m_freem(m); goto drop; } GRE_RLOCK(sc); if (sc->gre_family == 0) { GRE_RUNLOCK(sc); error = ENETDOWN; m_freem(m); goto drop; } iaf = m->m_pkthdr.csum_data; oaf = sc->gre_family; hlen = sc->gre_hlen; want_seq = (sc->gre_options & GRE_ENABLE_SEQ) != 0; if (want_seq) oseq = sc->gre_oseq++; /* XXX */ else oseq = 0; /* Make compiler happy. 
*/ want_csum = (sc->gre_options & GRE_ENABLE_CSUM) != 0; M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { GRE_RUNLOCK(sc); error = ENOBUFS; goto drop; } bcopy(sc->gre_hdr, mtod(m, void *), hlen); GRE_RUNLOCK(sc); switch (oaf) { #ifdef INET case AF_INET: olen = sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: olen = sizeof(struct ip6_hdr); break; #endif default: error = ENETDOWN; goto drop; } gh = (struct grehdr *)mtodo(m, olen); switch (iaf) { #ifdef INET case AF_INET: gh->gre_proto = htons(ETHERTYPE_IP); break; #endif #ifdef INET6 case AF_INET6: gh->gre_proto = htons(ETHERTYPE_IPV6); break; #endif default: error = ENETDOWN; goto drop; } if (want_seq) gre_setseqn(gh, oseq); if (want_csum) { *(uint16_t *)gh->gre_opts = in_cksum_skip(m, m->m_pkthdr.len, olen); } plen = m->m_pkthdr.len - hlen; switch (oaf) { #ifdef INET case AF_INET: error = in_gre_output(m, iaf, hlen); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_output(m, iaf, hlen); break; #endif default: m_freem(m); error = ENETDOWN; } drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } return (error); } static void gre_qflush(struct ifnet *ifp __unused) { } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); diff --git a/sys/net/if_gre.h b/sys/net/if_gre.h index 0eac9e9f33b0..1a068d4b1118 100644 --- a/sys/net/if_gre.h +++ b/sys/net/if_gre.h @@ -1,137 +1,137 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014 Andrey V. 
Elsukov * All rights reserved * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: if_gre.h,v 1.13 2003/11/10 08:51:52 wiz Exp $ * $FreeBSD$ */ #ifndef _NET_IF_GRE_H_ #define _NET_IF_GRE_H_ #ifdef _KERNEL /* GRE header according to RFC 2784 and RFC 2890 */ struct grehdr { uint16_t gre_flags; /* GRE flags */ #define GRE_FLAGS_CP 0x8000 /* checksum present */ #define GRE_FLAGS_KP 0x2000 /* key present */ #define GRE_FLAGS_SP 0x1000 /* sequence present */ #define GRE_FLAGS_MASK (GRE_FLAGS_CP|GRE_FLAGS_KP|GRE_FLAGS_SP) uint16_t gre_proto; /* protocol type */ uint32_t gre_opts[0]; /* optional fields */ } __packed; #ifdef INET struct greip { struct ip gi_ip; struct grehdr gi_gre; } __packed; #endif #ifdef INET6 struct greip6 { struct ip6_hdr gi6_ip6; struct grehdr gi6_gre; } __packed; #endif struct gre_softc { struct ifnet *gre_ifp; LIST_ENTRY(gre_softc) gre_list; struct rmlock gre_lock; int gre_family; /* AF of delivery header */ uint32_t gre_iseq; uint32_t gre_oseq; uint32_t gre_key; uint32_t gre_options; u_int gre_fibnum; u_int gre_hlen; /* header size */ union { void *hdr; #ifdef INET struct greip *gihdr; #endif #ifdef INET6 struct greip6 *gi6hdr; #endif } gre_uhdr; const struct encaptab *gre_ecookie; }; #define GRE2IFP(sc) ((sc)->gre_ifp) #define GRE_LOCK_INIT(sc) rm_init(&(sc)->gre_lock, "gre softc") #define GRE_LOCK_DESTROY(sc) rm_destroy(&(sc)->gre_lock) #define GRE_RLOCK_TRACKER struct rm_priotracker gre_tracker #define GRE_RLOCK(sc) rm_rlock(&(sc)->gre_lock, &gre_tracker) #define GRE_RUNLOCK(sc) rm_runlock(&(sc)->gre_lock, &gre_tracker) #define GRE_RLOCK_ASSERT(sc) rm_assert(&(sc)->gre_lock, RA_RLOCKED) #define GRE_WLOCK(sc) rm_wlock(&(sc)->gre_lock) #define GRE_WUNLOCK(sc) rm_wunlock(&(sc)->gre_lock) #define GRE_WLOCK_ASSERT(sc) rm_assert(&(sc)->gre_lock, RA_WLOCKED) #define gre_hdr gre_uhdr.hdr #define gre_gihdr gre_uhdr.gihdr #define gre_gi6hdr gre_uhdr.gi6hdr #define gre_oip gre_gihdr->gi_ip #define gre_oip6 gre_gi6hdr->gi6_ip6 -int gre_input(struct mbuf **, int *, int); +int gre_input(struct mbuf *, int, int, void *); #ifdef 
INET int in_gre_attach(struct gre_softc *); int in_gre_output(struct mbuf *, int, int); #endif #ifdef INET6 int in6_gre_attach(struct gre_softc *); int in6_gre_output(struct mbuf *, int, int); #endif /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated * into GRE. */ #define ETHERTYPE_WCCP 0x883E #endif /* _KERNEL */ #define GRESADDRS _IOW('i', 101, struct ifreq) #define GRESADDRD _IOW('i', 102, struct ifreq) #define GREGADDRS _IOWR('i', 103, struct ifreq) #define GREGADDRD _IOWR('i', 104, struct ifreq) #define GRESPROTO _IOW('i' , 105, struct ifreq) #define GREGPROTO _IOWR('i', 106, struct ifreq) #define GREGKEY _IOWR('i', 107, struct ifreq) #define GRESKEY _IOW('i', 108, struct ifreq) #define GREGOPTS _IOWR('i', 109, struct ifreq) #define GRESOPTS _IOW('i', 110, struct ifreq) #define GRE_ENABLE_CSUM 0x0001 #define GRE_ENABLE_SEQ 0x0002 #define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) #endif /* _NET_IF_GRE_H_ */ diff --git a/sys/net/if_me.c b/sys/net/if_me.c index 4ab013bdcce2..806c57eabb4f 100644 --- a/sys/net/if_me.c +++ b/sys/net/if_me.c @@ -1,664 +1,652 @@ /*- * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MEMTU (1500 - sizeof(struct mobhdr)) static const char mename[] = "me"; static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP"); static VNET_DEFINE(struct mtx, me_mtx); #define V_me_mtx VNET(me_mtx) /* Minimal forwarding header RFC 2004 */ struct mobhdr { uint8_t mob_proto; /* protocol */ uint8_t mob_flags; /* flags */ #define MOB_FLAGS_SP 0x80 /* source present */ uint16_t mob_csum; /* header checksum */ struct in_addr mob_dst; /* original destination address */ struct in_addr mob_src; /* original source addr (optional) */ } __packed; struct me_softc { struct ifnet *me_ifp; LIST_ENTRY(me_softc) me_list; struct rmlock me_lock; u_int me_fibnum; const struct encaptab *me_ecookie; struct in_addr me_src; struct in_addr me_dst; }; #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) #define ME_LOCK_INIT(sc) rm_init(&(sc)->me_lock, "me softc") #define ME_LOCK_DESTROY(sc) rm_destroy(&(sc)->me_lock) #define ME_RLOCK_TRACKER struct rm_priotracker me_tracker #define ME_RLOCK(sc) rm_rlock(&(sc)->me_lock, &me_tracker) #define ME_RUNLOCK(sc) rm_runlock(&(sc)->me_lock, &me_tracker) #define 
ME_RLOCK_ASSERT(sc) rm_assert(&(sc)->me_lock, RA_RLOCKED) #define ME_WLOCK(sc) rm_wlock(&(sc)->me_lock) #define ME_WUNLOCK(sc) rm_wunlock(&(sc)->me_lock) #define ME_WLOCK_ASSERT(sc) rm_assert(&(sc)->me_lock, RA_WLOCKED) #define ME_LIST_LOCK_INIT(x) mtx_init(&V_me_mtx, "me_mtx", NULL, MTX_DEF) #define ME_LIST_LOCK_DESTROY(x) mtx_destroy(&V_me_mtx) #define ME_LIST_LOCK(x) mtx_lock(&V_me_mtx) #define ME_LIST_UNLOCK(x) mtx_unlock(&V_me_mtx) static VNET_DEFINE(LIST_HEAD(, me_softc), me_softc_list); #define V_me_softc_list VNET(me_softc_list) static struct sx me_ioctl_sx; SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl"); static int me_clone_create(struct if_clone *, int, caddr_t); static void me_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, me_cloner); #define V_me_cloner VNET(me_cloner) static void me_qflush(struct ifnet *); static int me_transmit(struct ifnet *, struct mbuf *); static int me_ioctl(struct ifnet *, u_long, caddr_t); static int me_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); -static int me_input(struct mbuf **, int *, int); +static int me_input(struct mbuf *, int, int, void *); static int me_set_tunnel(struct ifnet *, struct sockaddr_in *, struct sockaddr_in *); static void me_delete_tunnel(struct ifnet *); +static int me_encapcheck(const struct mbuf *, int, int, void *); + +#define ME_MINLEN (sizeof(struct ip) + sizeof(struct mobhdr) -\ + sizeof(in_addr_t)) +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_MOBILE, + .min_length = ME_MINLEN, + .exact_match = (sizeof(in_addr_t) << 4) + 8, + .check = me_encapcheck, + .input = me_input +}; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW, 0, "Minimal Encapsulation for IP (RFC 2004)"); #ifndef MAX_ME_NEST #define MAX_ME_NEST 1 #endif static VNET_DEFINE(int, max_me_nesting) = MAX_ME_NEST; #define V_max_me_nesting VNET(max_me_nesting) SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | 
CTLFLAG_VNET, &VNET_NAME(max_me_nesting), 0, "Max nested tunnels"); -extern struct domain inetdomain; -static const struct protosw in_mobile_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_MOBILE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = me_input, - .pr_output = rip_output, - .pr_ctlinput = rip_ctlinput, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - static void vnet_me_init(const void *unused __unused) { LIST_INIT(&V_me_softc_list); ME_LIST_LOCK_INIT(); V_me_cloner = if_clone_simple(mename, me_clone_create, me_clone_destroy, 0); } VNET_SYSINIT(vnet_me_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_init, NULL); static void vnet_me_uninit(const void *unused __unused) { if_clone_detach(V_me_cloner); ME_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_uninit, NULL); static int me_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct me_softc *sc; sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO); sc->me_fibnum = curthread->td_proc->p_fibnum; ME2IFP(sc) = if_alloc(IFT_TUNNEL); ME_LOCK_INIT(sc); ME2IFP(sc)->if_softc = sc; if_initname(ME2IFP(sc), mename, unit); ME2IFP(sc)->if_mtu = MEMTU;; ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; ME2IFP(sc)->if_output = me_output; ME2IFP(sc)->if_ioctl = me_ioctl; ME2IFP(sc)->if_transmit = me_transmit; ME2IFP(sc)->if_qflush = me_qflush; ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(ME2IFP(sc)); bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t)); ME_LIST_LOCK(); LIST_INSERT_HEAD(&V_me_softc_list, sc, me_list); ME_LIST_UNLOCK(); return (0); } static void me_clone_destroy(struct ifnet *ifp) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; me_delete_tunnel(ifp); ME_LIST_LOCK(); LIST_REMOVE(sc, me_list); ME_LIST_UNLOCK(); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; 
sx_xunlock(&me_ioctl_sx); if_free(ifp); ME_LOCK_DESTROY(sc); free(sc, M_IFME); } static int me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { ME_RLOCK_TRACKER; struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *src, *dst; struct me_softc *sc; int error; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); } sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: src = (struct sockaddr_in *) &(((struct in_aliasreq *)data)->ifra_addr); dst = (struct sockaddr_in *) &(((struct in_aliasreq *)data)->ifra_dstaddr); if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } error = me_set_tunnel(ifp, src, dst); break; case SIOCDIFPHYADDR: me_delete_tunnel(ifp); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: ME_RLOCK(sc); if (!ME_READY(sc)) { error = EADDRNOTAVAIL; ME_RUNLOCK(sc); break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); switch (cmd) { case SIOCGIFPSRCADDR: src->sin_addr = sc->me_src; break; case SIOCGIFPDSTADDR: src->sin_addr = sc->me_dst; break; } ME_RUNLOCK(sc); error = prison_if(curthread->td_ucred, sintosa(src)); if (error != 0) memset(src, 0, sizeof(*src)); break; case SIOCGTUNFIB: ifr->ifr_fib = sc->me_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->me_fibnum = ifr->ifr_fib; break; default: error = EINVAL; break; } end: sx_xunlock(&me_ioctl_sx); return (error); } static int 
me_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { ME_RLOCK_TRACKER; struct me_softc *sc; struct ip *ip; int ret; sc = (struct me_softc *)arg; if ((ME2IFP(sc)->if_flags & IFF_UP) == 0) return (0); M_ASSERTPKTHDR(m); - if (m->m_pkthdr.len < sizeof(struct ip) + sizeof(struct mobhdr) - - sizeof(struct in_addr)) - return (0); - ret = 0; ME_RLOCK(sc); if (ME_READY(sc)) { ip = mtod(m, struct ip *); if (sc->me_src.s_addr == ip->ip_dst.s_addr && sc->me_dst.s_addr == ip->ip_src.s_addr) - ret = 32 * 2; + ret = 32 * 2 + 8; } ME_RUNLOCK(sc); return (ret); } static int me_set_tunnel(struct ifnet *ifp, struct sockaddr_in *src, struct sockaddr_in *dst) { struct me_softc *sc, *tsc; sx_assert(&me_ioctl_sx, SA_XLOCKED); ME_LIST_LOCK(); sc = ifp->if_softc; LIST_FOREACH(tsc, &V_me_softc_list, me_list) { if (tsc == sc || !ME_READY(tsc)) continue; if (tsc->me_src.s_addr == src->sin_addr.s_addr && tsc->me_dst.s_addr == dst->sin_addr.s_addr) { ME_LIST_UNLOCK(); return (EADDRNOTAVAIL); } } ME_LIST_UNLOCK(); ME_WLOCK(sc); sc->me_dst = dst->sin_addr; sc->me_src = src->sin_addr; ME_WUNLOCK(sc); if (sc->me_ecookie == NULL) - sc->me_ecookie = encap_attach_func(AF_INET, IPPROTO_MOBILE, - me_encapcheck, &in_mobile_protosw, sc); + sc->me_ecookie = ip_encap_attach(&ipv4_encap_cfg, + sc, M_WAITOK); if (sc->me_ecookie != NULL) { ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_UP); } return (0); } static void me_delete_tunnel(struct ifnet *ifp) { struct me_softc *sc = ifp->if_softc; sx_assert(&me_ioctl_sx, SA_XLOCKED); if (sc->me_ecookie != NULL) - encap_detach(sc->me_ecookie); + ip_encap_detach(sc->me_ecookie); sc->me_ecookie = NULL; ME_WLOCK(sc); sc->me_src.s_addr = 0; sc->me_dst.s_addr = 0; ME_WUNLOCK(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_DOWN); } static uint16_t me_in_cksum(uint16_t *p, int nwords) { uint32_t sum = 0; while (nwords-- > 0) sum += *p++; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); 
return (~sum); } -int -me_input(struct mbuf **mp, int *offp, int proto) +static int +me_input(struct mbuf *m, int off, int proto, void *arg) { - struct me_softc *sc; + struct me_softc *sc = arg; struct mobhdr *mh; struct ifnet *ifp; - struct mbuf *m; struct ip *ip; int hlen; - m = *mp; - sc = encap_getarg(m); - KASSERT(sc != NULL, ("encap_getarg returned NULL")); - ifp = ME2IFP(sc); /* checks for short packets */ hlen = sizeof(struct mobhdr); if (m->m_pkthdr.len < sizeof(struct ip) + hlen) hlen -= sizeof(struct in_addr); if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) goto drop; mh = (struct mobhdr *)mtodo(m, sizeof(struct ip)); /* check for wrong flags */ if (mh->mob_flags & (~MOB_FLAGS_SP)) { m_freem(m); goto drop; } if (mh->mob_flags) { if (hlen != sizeof(struct mobhdr)) { m_freem(m); goto drop; } } else hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); /* check mobile header checksum */ if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) { m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif ip = mtod(m, struct ip *); ip->ip_dst = mh->mob_dst; ip->ip_p = mh->mob_proto; ip->ip_sum = 0; ip->ip_len = htons(m->m_pkthdr.len - hlen); if (mh->mob_flags) ip->ip_src = mh->mob_src; memmove(mtodo(m, hlen), ip, sizeof(struct ip)); m_adj(m, hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); M_SETFIB(m, ifp->if_fib); hlen = AF_INET; BPF_MTAP2(ifp, &hlen, sizeof(hlen), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(NETISR_IP, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (IPPROTO_DONE); } #define MTAG_ME 1414491977 static int me_check_nesting(struct ifnet *ifp, struct mbuf *m) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, 
MTAG_ME, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", ifp->if_xname); return (EIO); } count++; } if (count > V_max_me_nesting) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", ifp->if_xname, count); return (EIO); } mtag = m_tag_alloc(MTAG_ME, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } static int me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto drop; } error = me_check_nesting(ifp, m); if (error != 0) goto drop; m->m_flags &= ~(M_BCAST|M_MCAST); if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (af != AF_INET) { error = EAFNOSUPPORT; goto drop; } BPF_MTAP2(ifp, &af, sizeof(af), m); return (ifp->if_transmit(ifp, m)); drop: m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static int me_transmit(struct ifnet *ifp, struct mbuf *m) { ME_RLOCK_TRACKER; struct mobhdr mh; struct me_softc *sc; struct ip *ip; int error, hlen, plen; sc = ifp->if_softc; if (sc == NULL) { error = ENETDOWN; m_freem(m); goto drop; } if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto drop; } ip = mtod(m, struct ip *); /* Fragmented datagramms shouldn't be encapsulated */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { error = EINVAL; m_freem(m); goto drop; } mh.mob_proto = ip->ip_p; mh.mob_src = ip->ip_src; mh.mob_dst = ip->ip_dst; ME_RLOCK(sc); if (!ME_READY(sc)) { ME_RUNLOCK(sc); error = ENETDOWN; m_freem(m); goto drop; } if (in_hosteq(sc->me_src, ip->ip_src)) { hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); mh.mob_flags = 0; } 
else { hlen = sizeof(struct mobhdr); mh.mob_flags = MOB_FLAGS_SP; } plen = m->m_pkthdr.len; ip->ip_src = sc->me_src; ip->ip_dst = sc->me_dst; M_SETFIB(m, sc->me_fibnum); ME_RUNLOCK(sc); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) { error = ENOBUFS; goto drop; } memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip)); ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_p = IPPROTO_MOBILE; ip->ip_sum = 0; mh.mob_csum = 0; mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t)); bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } return (error); } static void me_qflush(struct ifnet *ifp __unused) { } static int memodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t me_mod = { "if_me", memodevent, 0 }; DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_me, 1); diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index b10202076d02..f073e20858cd 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -1,767 +1,753 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. 
Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. */ #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. 
*/ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; -extern struct domain inetdomain; -static int in_stf_input(struct mbuf **, int *, int); -static struct protosw in_stf_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_IPV6, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = in_stf_input, - .pr_output = rip_output, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - +static int in_stf_input(struct mbuf *, int, int, void *); static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); static struct if_clone *stf_cloner; +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_IPV6, + .min_length = sizeof(struct ip), + .exact_match = (sizeof(in_addr_t) << 3) + 8, + .check = stf_encapcheck, + .input = in_stf_input +}; + static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int 
stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. 
*/ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; - sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, - stf_encapcheck, &in_stf_protosw, sc); + sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static int stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err __unused; - err = encap_detach(sc->encap_cookie); + err = ip_encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in_addr a, b, mask; struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) return 0; if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) return 0; m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return 0; if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return 
(0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; /* * check if IPv4 src matches the IPv4 address derived from the * local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); bcopy(GET_V4(&addr6), &a, sizeof(a)); bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; if (a.s_addr != b.s_addr) return 0; /* stf interface makes single side match only */ return 32; } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ia->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) continue; bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; ia6 = (struct in6_ifaddr *)ia; *addr = sin6->sin6_addr; *mask = ia6->ia_prefixmask.sin6_addr; if_addr_runlock(ifp); return (0); } if_addr_runlock(ifp); return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct in_addr in4; const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & 
IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } ip6 = mtod(m, struct ip6_hdr *); tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ ptr = NULL; if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) ptr = GET_V4(&ip6->ip6_dst); else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
*/ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); return error; } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return 1; return 0; } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return -1; switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return -1; } /* * reject packets with private address range. 
/*
 * Sanity-check an inner IPv6 address.
 *
 * A 6to4 address is judged by its embedded IPv4 address through
 * stf_checkaddr4(), whose result is returned as is.  For any other
 * address, IPv4-compatible and IPv4-mapped forms are rejected with -1
 * (ip6_input performs the same test; it is repeated here to drop bad
 * packets earlier and to stay safe against future ip6_input changes),
 * and everything else is accepted with 0.
 */
static int
stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp)
{
	struct in_addr embedded;

	if (!IN6_IS_ADDR_6TO4(in6)) {
		if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6))
			return -1;
		return 0;
	}

	/* GET_V4() is only 16-bit aligned, so copy rather than cast. */
	bcopy(GET_V4(in6), &embedded, sizeof(embedded));
	return stf_checkaddr4(sc, &embedded, inifp);
}
*/ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } otos = ip->ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; - + if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. 
*/ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); return (IPPROTO_DONE); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; struct sockaddr_in6 *sin6; struct in_addr addr; int error, mtu; error = 0; switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { error = EINVAL; break; } bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); if (isrfc1918addr(&addr)) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return error; } diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c index c30709c878ea..835760c8a4d6 100644 --- a/sys/netinet/in_gif.c +++ b/sys/netinet/in_gif.c @@ -1,218 +1,206 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include -static int in_gif_input(struct mbuf **, int *, int); - -extern struct domain inetdomain; -static struct protosw in_gif_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = 0/* IPPROTO_IPV[46] */, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = in_gif_input, - .pr_output = rip_output, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - #define GIF_TTL 30 static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; #define V_ip_gif_ttl VNET(ip_gif_ttl) SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(ip_gif_ttl), 0, ""); + &VNET_NAME(ip_gif_ttl), 0, "Default TTL value for encapsulated 
/*
 * Encapsulate 'm' in an outer IPv4 header and send it with ip_output().
 *
 * The outer header is copied from the softc's gif_iphdr template while the
 * gif read lock is held; protocol, TTL (V_ip_gif_ttl), total length and TOS
 * ('ecn') are then filled in per packet.
 *
 * Returns ENOBUFS if M_PREPEND() leaves no mbuf, ENETDOWN (after freeing
 * the mbuf) if the tunnel's outer family is not AF_INET, and otherwise
 * whatever ip_output() returns.
 */
int
in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
{
	GIF_RLOCK_TRACKER;
	struct gif_softc *sc = ifp->if_softc;
	struct ip *ip;
	int len;

	/* prepend new IP header */
	len = sizeof(struct ip);
#ifndef __NO_STRICT_ALIGNMENT
	/* EtherIP payloads get ETHERIP_ALIGN extra bytes of headroom. */
	if (proto == IPPROTO_ETHERIP)
		len += ETHERIP_ALIGN;
#endif
	M_PREPEND(m, len, M_NOWAIT);
	if (m == NULL)
		return (ENOBUFS);
#ifndef __NO_STRICT_ALIGNMENT
	if (proto == IPPROTO_ETHERIP) {
		/*
		 * Skip forward by the current misalignment of the data
		 * pointer (asserted to be 0 or ETHERIP_ALIGN) and give back
		 * the extra bytes reserved above.
		 * NOTE(review): presumably this keeps the outer IP header
		 * 32-bit aligned -- confirm against gif_input().
		 */
		len = mtod(m, vm_offset_t) & 3;
		KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in_gif_output: unexpected misalignment"));
		m->m_data += len;
		m->m_len -= ETHERIP_ALIGN;
	}
#endif
	ip = mtod(m, struct ip *);
	GIF_RLOCK(sc);
	/* The tunnel must still be configured with IPv4 outer addresses. */
	if (sc->gif_family != AF_INET) {
		m_freem(m);
		GIF_RUNLOCK(sc);
		return (ENETDOWN);
	}
	/* Start from the preconfigured outer header template. */
	bcopy(sc->gif_iphdr, ip, sizeof(struct ip));
	GIF_RUNLOCK(sc);

	ip->ip_p = proto;
	/* version will be set in ip_output() */
	ip->ip_ttl = V_ip_gif_ttl;
	ip->ip_len = htons(m->m_pkthdr.len);
	ip->ip_tos = ecn;

	return (ip_output(m, NULL, NULL, 0, NULL, NULL));
}
*/ int in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { const struct ip *ip; struct gif_softc *sc; int ret; /* sanity check done in caller */ sc = (struct gif_softc *)arg; GIF_RLOCK_ASSERT(sc); /* check for address match */ ip = mtod(m, const struct ip *); if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr) return (0); - ret = 32; + ret = 32 + 8; /* src + proto */ if (sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) { if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0) return (0); } else ret += 32; /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { struct nhop4_basic nh4; struct in_addr dst; dst = ip->ip_src; if (fib4_lookup_nh_basic(sc->gif_fibnum, dst, 0, 0, &nh4) != 0) return (0); if (nh4.nh_ifp != m->m_pkthdr.rcvif) return (0); } return (ret); } +static const struct encap_config ipv4_encap_cfg = { + .proto = -1, + .min_length = sizeof(struct ip), + .exact_match = (sizeof(in_addr_t) << 4) + 8, + .check = gif_encapcheck, + .input = in_gif_input +}; + int in_gif_attach(struct gif_softc *sc) { KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); - sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck, - &in_gif_protosw, sc); - if (sc->gif_ecookie == NULL) - return (EEXIST); + sc->gif_ecookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); return (0); } diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c index 82d46986e12b..7679d4202bae 100644 --- a/sys/netinet/ip_encap.c +++ b/sys/netinet/ip_encap.c @@ -1,472 +1,270 @@ /* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * My grandfather said that there's a devil inside tunnelling technology... * * We have surprisingly many protocols that want packets with IP protocol * #4 or #41. Here's a list of protocols that want protocol #41: * RFC1933 configured tunnel * RFC1933 automatic tunnel * RFC2401 IPsec tunnel * RFC2473 IPv6 generic packet tunnelling * RFC2529 6over4 tunnel * mobile-ip6 (uses RFC2473) * RFC3056 6to4 tunnel * isatap tunnel * Here's a list of protocol that want protocol #4: * RFC1853 IPv4-in-IPv4 tunnelling * RFC2003 IPv4 encapsulation within IPv4 * RFC2344 reverse tunnelling for mobile-ip4 * RFC2401 IPsec tunnel * Well, what can I say. 
They impose different en/decapsulation mechanism * from each other, so they need separate protocol handler. The only one * we can easily determine by protocol # is IPsec, which always has * AH/ESP/IPComp header right after outer IP header. * * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ -/* XXX is M_NETADDR correct? */ #include __FBSDID("$FreeBSD$"); -#include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include +#include #include +#include #include -#include -#include #include #include -#include -#include +#include #include -#include +#include #include -#include -#include #include #include #ifdef INET6 -#include #include #endif -#include +static MALLOC_DEFINE(M_NETADDR, "encap_export_host", + "Export host address structure"); -#include -#include -static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure"); +struct encaptab { + CK_LIST_ENTRY(encaptab) chain; + int proto; + int min_length; + int exact_match; + void *arg; -static void encap_add(struct encaptab *); -static int mask_match(const struct encaptab *, const struct sockaddr *, - const struct sockaddr *); -static void encap_fillarg(struct mbuf *, void *); + encap_lookup_t lookup; + encap_check_t check; + encap_input_t input; +}; + +CK_LIST_HEAD(encaptab_head, encaptab); +#ifdef INET +static struct encaptab_head ipv4_encaptab = CK_LIST_HEAD_INITIALIZER(); +#endif +#ifdef INET6 +static struct encaptab_head ipv6_encaptab = CK_LIST_HEAD_INITIALIZER(); +#endif -/* - * All global variables in ip_encap.c are locked using encapmtx. 
- */ static struct mtx encapmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); -static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab); - -#ifdef INET -int -encap4_input(struct mbuf **mp, int *offp, int proto) +#define ENCAP_WLOCK() mtx_lock(&encapmtx) +#define ENCAP_WUNLOCK() mtx_unlock(&encapmtx) +#define ENCAP_RLOCK() epoch_enter_preempt(net_epoch_preempt) +#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define ENCAP_WAIT() epoch_wait_preempt(net_epoch_preempt) + +static struct encaptab * +encap_attach(struct encaptab_head *head, const struct encap_config *cfg, + void *arg, int mflags) { - struct ip *ip; - struct mbuf *m; - struct sockaddr_in s, d; - const struct protosw *psw; - struct encaptab *ep, *match; - void *arg; - int matchprio, off, prio; - - m = *mp; - off = *offp; - ip = mtod(m, struct ip *); - - bzero(&s, sizeof(s)); - s.sin_family = AF_INET; - s.sin_len = sizeof(struct sockaddr_in); - s.sin_addr = ip->ip_src; - bzero(&d, sizeof(d)); - d.sin_family = AF_INET; - d.sin_len = sizeof(struct sockaddr_in); - d.sin_addr = ip->ip_dst; - - arg = NULL; - psw = NULL; - match = NULL; - matchprio = 0; - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != AF_INET) - continue; - if (ep->proto >= 0 && ep->proto != proto) - continue; - if (ep->func) - prio = (*ep->func)(m, off, proto, ep->arg); - else { - /* - * it's inbound traffic, we need to match in reverse - * order - */ - prio = mask_match(ep, (struct sockaddr *)&d, - (struct sockaddr *)&s); - } + struct encaptab *ep, *tmp; - /* - * We prioritize the matches by using bit length of the - * matches. mask_match() and user-supplied matching function - * should return the bit length of the matches (for example, - * if both src/dst are matched for IPv4, 64 should be returned). - * 0 or negative return value means "it did not match". - * - * The question is, since we have two "mask" portion, we - * cannot really define total order between entries. 
- * For example, which of these should be preferred? - * mask_match() returns 48 (32 + 16) for both of them. - * src=3ffe::/16, dst=3ffe:501::/32 - * src=3ffe:501::/32, dst=3ffe::/16 - * - * We need to loop through all the possible candidates - * to get the best match - the search takes O(n) for - * n attachments (i.e. interfaces). - */ - if (prio <= 0) - continue; - if (prio > matchprio) { - matchprio = prio; - match = ep; - } - } - if (match != NULL) { - psw = match->psw; - arg = match->arg; - } - mtx_unlock(&encapmtx); + if (cfg == NULL || cfg->input == NULL || + (cfg->check == NULL && cfg->lookup == NULL) || + (cfg->lookup != NULL && cfg->exact_match != ENCAP_DRV_LOOKUP) || + (cfg->exact_match == ENCAP_DRV_LOOKUP && cfg->lookup == NULL)) + return (NULL); - if (match != NULL) { - /* found a match, "match" has the best one */ - if (psw != NULL && psw->pr_input != NULL) { - encap_fillarg(m, arg); - (*psw->pr_input)(mp, offp, proto); - } else - m_freem(m); - return (IPPROTO_DONE); + ep = malloc(sizeof(*ep), M_NETADDR, mflags); + if (ep == NULL) + return (NULL); + + ep->proto = cfg->proto; + ep->min_length = cfg->min_length; + ep->exact_match = cfg->exact_match; + ep->arg = arg; + ep->lookup = cfg->exact_match == ENCAP_DRV_LOOKUP ? cfg->lookup: NULL; + ep->check = cfg->exact_match != ENCAP_DRV_LOOKUP ? 
cfg->check: NULL; + ep->input = cfg->input; + + ENCAP_WLOCK(); + CK_LIST_FOREACH(tmp, head, chain) { + if (tmp->exact_match <= ep->exact_match) + break; } + if (tmp == NULL) + CK_LIST_INSERT_HEAD(head, ep, chain); + else + CK_LIST_INSERT_BEFORE(tmp, ep, chain); + ENCAP_WUNLOCK(); + return (ep); +} + +static int +encap_detach(struct encaptab_head *head, const struct encaptab *cookie) +{ + struct encaptab *ep; - /* last resort: inject to raw socket */ - return (rip_input(mp, offp, proto)); + ENCAP_WLOCK(); + CK_LIST_FOREACH(ep, head, chain) { + if (ep == cookie) { + CK_LIST_REMOVE(ep, chain); + ENCAP_WUNLOCK(); + ENCAP_WAIT(); + free(ep, M_NETADDR); + return (0); + } + } + ENCAP_WUNLOCK(); + return (EINVAL); } -#endif -#ifdef INET6 -int -encap6_input(struct mbuf **mp, int *offp, int proto) +static int +encap_input(struct encaptab_head *head, struct mbuf *m, int off, int proto) { - struct mbuf *m = *mp; - struct ip6_hdr *ip6; - struct sockaddr_in6 s, d; - const struct protosw *psw; struct encaptab *ep, *match; void *arg; - int prio, matchprio; - - ip6 = mtod(m, struct ip6_hdr *); + int matchprio, ret; - bzero(&s, sizeof(s)); - s.sin6_family = AF_INET6; - s.sin6_len = sizeof(struct sockaddr_in6); - s.sin6_addr = ip6->ip6_src; - bzero(&d, sizeof(d)); - d.sin6_family = AF_INET6; - d.sin6_len = sizeof(struct sockaddr_in6); - d.sin6_addr = ip6->ip6_dst; - - arg = NULL; - psw = NULL; match = NULL; matchprio = 0; - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != AF_INET6) - continue; + + ENCAP_RLOCK(); + CK_LIST_FOREACH(ep, head, chain) { if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) - prio = (*ep->func)(m, *offp, proto, ep->arg); - else { - /* - * it's inbound traffic, we need to match in reverse - * order - */ - prio = mask_match(ep, (struct sockaddr *)&d, - (struct sockaddr *)&s); - } - - /* see encap4_input() for issues here */ - if (prio <= 0) + if (ep->min_length > m->m_pkthdr.len) continue; - if (prio > matchprio) { - 
matchprio = prio; + if (ep->exact_match == ENCAP_DRV_LOOKUP) + ret = (*ep->lookup)(m, off, proto, &arg); + else + ret = (*ep->check)(m, off, proto, ep->arg); + if (ret <= 0) + continue; + if (ret > matchprio) { match = ep; + if (ep->exact_match != ENCAP_DRV_LOOKUP) + arg = ep->arg; + /* + * No need to continue the search, we got the + * exact match. + */ + if (ret >= ep->exact_match) + break; + matchprio = ret; } } - if (match != NULL) { - psw = match->psw; - arg = match->arg; - } - mtx_unlock(&encapmtx); if (match != NULL) { - /* found a match */ - if (psw != NULL && psw->pr_input != NULL) { - encap_fillarg(m, arg); - return (*psw->pr_input)(mp, offp, proto); - } else { - m_freem(m); - return (IPPROTO_DONE); - } + /* found a match, "match" has the best one */ + ret = (*match->input)(m, off, proto, arg); + ENCAP_RUNLOCK(); + MPASS(ret == IPPROTO_DONE); + return (IPPROTO_DONE); } - - /* last resort: inject to raw socket */ - return rip6_input(mp, offp, proto); -} -#endif - -/*lint -sem(encap_add, custodial(1)) */ -static void -encap_add(struct encaptab *ep) -{ - - mtx_assert(&encapmtx, MA_OWNED); - LIST_INSERT_HEAD(&encaptab, ep, chain); + ENCAP_RUNLOCK(); + return (0); } -/* - * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. - * length of mask (sm and dm) is assumed to be same as sp/dp. - * Return value will be necessary as input (cookie) for encap_detach(). 
- */ +#ifdef INET const struct encaptab * -encap_attach(int af, int proto, const struct sockaddr *sp, - const struct sockaddr *sm, const struct sockaddr *dp, - const struct sockaddr *dm, const struct protosw *psw, void *arg) +ip_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { - struct encaptab *ep; - - /* sanity check on args */ - if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) - return (NULL); - if (sp->sa_len != dp->sa_len) - return (NULL); - if (af != sp->sa_family || af != dp->sa_family) - return (NULL); - /* check if anyone have already attached with exactly same config */ - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != af) - continue; - if (ep->proto != proto) - continue; - if (ep->src.ss_len != sp->sa_len || - bcmp(&ep->src, sp, sp->sa_len) != 0 || - bcmp(&ep->srcmask, sm, sp->sa_len) != 0) - continue; - if (ep->dst.ss_len != dp->sa_len || - bcmp(&ep->dst, dp, dp->sa_len) != 0 || - bcmp(&ep->dstmask, dm, dp->sa_len) != 0) - continue; - - mtx_unlock(&encapmtx); - return (NULL); - } - - ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ - if (ep == NULL) { - mtx_unlock(&encapmtx); - return (NULL); - } - bzero(ep, sizeof(*ep)); - - ep->af = af; - ep->proto = proto; - bcopy(sp, &ep->src, sp->sa_len); - bcopy(sm, &ep->srcmask, sp->sa_len); - bcopy(dp, &ep->dst, dp->sa_len); - bcopy(dm, &ep->dstmask, dp->sa_len); - ep->psw = psw; - ep->arg = arg; - - encap_add(ep); - mtx_unlock(&encapmtx); - return (ep); + return (encap_attach(&ipv4_encaptab, cfg, arg, mflags)); } -const struct encaptab * -encap_attach_func(int af, int proto, - int (*func)(const struct mbuf *, int, int, void *), - const struct protosw *psw, void *arg) +int +ip_encap_detach(const struct encaptab *cookie) { - struct encaptab *ep; - /* sanity check on args */ - if (!func) - return (NULL); - - ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ - if (ep == NULL) - return (NULL); - bzero(ep, sizeof(*ep)); - - ep->af = af; - 
ep->proto = proto; - ep->func = func; - ep->psw = psw; - ep->arg = arg; - - mtx_lock(&encapmtx); - encap_add(ep); - mtx_unlock(&encapmtx); - return (ep); + return (encap_detach(&ipv4_encaptab, cookie)); } int -encap_detach(const struct encaptab *cookie) +encap4_input(struct mbuf **mp, int *offp, int proto) { - const struct encaptab *ep = cookie; - struct encaptab *p; - - mtx_lock(&encapmtx); - LIST_FOREACH(p, &encaptab, chain) { - if (p == ep) { - LIST_REMOVE(p, chain); - mtx_unlock(&encapmtx); - free(p, M_NETADDR); /*XXX*/ - return 0; - } - } - mtx_unlock(&encapmtx); - return EINVAL; + if (encap_input(&ipv4_encaptab, *mp, *offp, proto) != IPPROTO_DONE) + return (rip_input(mp, offp, proto)); + return (IPPROTO_DONE); } +#endif /* INET */ -static int -mask_match(const struct encaptab *ep, const struct sockaddr *sp, - const struct sockaddr *dp) +#ifdef INET6 +const struct encaptab * +ip6_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { - struct sockaddr_storage s; - struct sockaddr_storage d; - int i; - const u_int8_t *p, *q; - u_int8_t *r; - int matchlen; - - if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) - return 0; - if (sp->sa_family != ep->af || dp->sa_family != ep->af) - return 0; - if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) - return 0; - - matchlen = 0; - - p = (const u_int8_t *)sp; - q = (const u_int8_t *)&ep->srcmask; - r = (u_int8_t *)&s; - for (i = 0 ; i < sp->sa_len; i++) { - r[i] = p[i] & q[i]; - /* XXX estimate */ - matchlen += (q[i] ? 8 : 0); - } - p = (const u_int8_t *)dp; - q = (const u_int8_t *)&ep->dstmask; - r = (u_int8_t *)&d; - for (i = 0 ; i < dp->sa_len; i++) { - r[i] = p[i] & q[i]; - /* XXX rough estimate */ - matchlen += (q[i] ? 
8 : 0); - } - - /* need to overwrite len/family portion as we don't compare them */ - s.ss_len = sp->sa_len; - s.ss_family = sp->sa_family; - d.ss_len = dp->sa_len; - d.ss_family = dp->sa_family; - - if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && - bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { - return matchlen; - } else - return 0; + return (encap_attach(&ipv6_encaptab, cfg, arg, mflags)); } -static void -encap_fillarg(struct mbuf *m, void *arg) +int +ip6_encap_detach(const struct encaptab *cookie) { - struct m_tag *tag; - if (arg != NULL) { - tag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT); - if (tag != NULL) { - *(void**)(tag+1) = arg; - m_tag_prepend(m, tag); - } - } + return (encap_detach(&ipv6_encaptab, cookie)); } -void * -encap_getarg(struct mbuf *m) +int +encap6_input(struct mbuf **mp, int *offp, int proto) { - void *p = NULL; - struct m_tag *tag; - tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); - if (tag) { - p = *(void**)(tag+1); - m_tag_delete(m, tag); - } - return p; + if (encap_input(&ipv6_encaptab, *mp, *offp, proto) != IPPROTO_DONE) + return (rip6_input(mp, offp, proto)); + return (IPPROTO_DONE); } +#endif /* INET6 */ diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index ef232189f398..f3d1d3afcab8 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -1,65 +1,70 @@ /* $FreeBSD$ */ /* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _NETINET_IP_ENCAP_H_ #define _NETINET_IP_ENCAP_H_ #ifdef _KERNEL -struct encaptab { - LIST_ENTRY(encaptab) chain; - int af; - int proto; /* -1: don't care, I'll check myself */ - struct sockaddr_storage src; /* my addr */ - struct sockaddr_storage srcmask; - struct sockaddr_storage dst; /* remote addr */ - struct sockaddr_storage dstmask; - int (*func)(const struct mbuf *, int, int, void *); - const struct protosw *psw; /* only pr_input will be used */ - void *arg; /* passed via m->m_pkthdr.aux */ -}; - int encap4_input(struct mbuf **, int *, int); int encap6_input(struct mbuf **, int *, int); -const struct encaptab *encap_attach(int, int, const struct sockaddr *, - const struct sockaddr *, const struct sockaddr *, - const struct sockaddr *, const struct protosw *, void *); -const struct encaptab *encap_attach_func(int, int, - int (*)(const struct mbuf *, int, int, void *), - const struct protosw *, void *); -int encap_detach(const struct encaptab *); -void *encap_getarg(struct mbuf *); + +typedef int (*encap_lookup_t)(const struct mbuf *, int, int, void **); +typedef int (*encap_check_t)(const struct mbuf *, int, int, void *); +typedef int (*encap_input_t)(struct mbuf *, int , int, void *); + +struct encap_config { + int proto; /* protocol */ + int min_length; /* minimum packet length */ + int exact_match; /* a packet is exactly matched */ +#define ENCAP_DRV_LOOKUP 0x7fffffff + + encap_lookup_t lookup; + encap_check_t check; + encap_input_t input; +}; + +struct encaptab; + +const struct encaptab *ip_encap_attach(const struct encap_config *, + void *arg, int mflags); +const struct encaptab *ip6_encap_attach(const struct encap_config *, + void *arg, int mflags); + +int ip_encap_detach(const struct encaptab *); +int ip6_encap_detach(const struct encaptab *); #endif #endif /*_NETINET_IP_ENCAP_H_*/ diff --git a/sys/netinet/ip_gre.c b/sys/netinet/ip_gre.c index 5b865e07bc45..66c1ce895ab9 100644 --- a/sys/netinet/ip_gre.c +++ b/sys/netinet/ip_gre.c @@ -1,169 
+1,154 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include -extern struct domain inetdomain; -static const struct protosw in_gre_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_GRE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = gre_input, - .pr_output = rip_output, - .pr_ctlinput = rip_ctlinput, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; #define V_ip_gre_ttl VNET(ip_gre_ttl) SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(ip_gre_ttl), 0, ""); + &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); static int in_gre_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct ip *ip; sc = (struct gre_softc *)arg; if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); M_ASSERTPKTHDR(m); - /* - * We expect that payload contains at least IPv4 - * or IPv6 packet. 
- */ - if (m->m_pkthdr.len < sizeof(struct greip) + sizeof(struct ip)) - return (0); GRE_RLOCK(sc); if (sc->gre_family == 0) goto bad; KASSERT(sc->gre_family == AF_INET, ("wrong gre_family: %d", sc->gre_family)); ip = mtod(m, struct ip *); if (sc->gre_oip.ip_src.s_addr != ip->ip_dst.s_addr || sc->gre_oip.ip_dst.s_addr != ip->ip_src.s_addr) goto bad; GRE_RUNLOCK(sc); - return (32 * 2); + return (32 * 3); /* src + dst + gre_hdr */ bad: GRE_RUNLOCK(sc); return (0); } int in_gre_output(struct mbuf *m, int af, int hlen) { struct greip *gi; gi = mtod(m, struct greip *); switch (af) { case AF_INET: /* * gre_transmit() has used M_PREPEND() that doesn't guarantee * m_data is contiguous more than hlen bytes. Use m_copydata() * here to avoid m_pullup(). */ m_copydata(m, hlen + offsetof(struct ip, ip_tos), sizeof(u_char), &gi->gi_ip.ip_tos); m_copydata(m, hlen + offsetof(struct ip, ip_id), sizeof(u_short), (caddr_t)&gi->gi_ip.ip_id); break; #ifdef INET6 case AF_INET6: gi->gi_ip.ip_tos = 0; /* XXX */ ip_fillid(&gi->gi_ip); break; #endif } gi->gi_ip.ip_ttl = V_ip_gre_ttl; gi->gi_ip.ip_len = htons(m->m_pkthdr.len); return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL)); } +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_GRE, + .min_length = sizeof(struct greip) + sizeof(struct ip), + .exact_match = (sizeof(in_addr_t) << 4) + 32, + .check = in_gre_encapcheck, + .input = gre_input +}; + int in_gre_attach(struct gre_softc *sc) { KASSERT(sc->gre_ecookie == NULL, ("gre_ecookie isn't NULL")); - sc->gre_ecookie = encap_attach_func(AF_INET, IPPROTO_GRE, - in_gre_encapcheck, &in_gre_protosw, sc); - if (sc->gre_ecookie == NULL) - return (EEXIST); + sc->gre_ecookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); return (0); } diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index fbf298fa54a0..ee747baa8bcc 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -1,2954 +1,2940 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * 
Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. 
Note that each lock is used * to cover not only the specific data structure but also related data * structures. */ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct 
vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 
0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); -extern struct domain inetdomain; -static const struct protosw in_pim_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_PIM, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, - .pr_input = pim_input, - .pr_output = rip_output, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; static const struct encaptab *pim_encap_cookie; - static int pim_encapcheck(const struct mbuf *, int, int, void *); +static int pim_input(struct mbuf *, int, int, void *); + +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_PIM, + .min_length = sizeof(struct ip) + PIM_MINLEN, + .exact_match = 8, + .check = pim_encapcheck, + .input = pim_input +}; /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. 
*/ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int 
set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
*/ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { 
int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl_fib(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; 
VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. 
*/ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. 
I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. 
*/ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { NET_EPOCH_EXIT(); VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; NET_EPOCH_EXIT(); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__, (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; 
bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); 
update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig 0x%08x group %lx parent %x qh %p", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. */ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct 
in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__, ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. 
*/ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? 
*/ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copym(mb0, 0, hlen, M_NOWAIT); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); 
TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. 
*/ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. 
*/ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. 
*/ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error __unused; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. 
*/ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if 
((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = 
x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we 
should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. 
*/
	    x->bm_start_time = *nowp;
	    x->bm_measured.b_packets = 0;
	    x->bm_measured.b_bytes = 0;
	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
	}
    }
}

/*
 * Prepare a bandwidth-related upcall
 *
 * Appends one entry describing meter `x' to the pending-upcall array
 * V_bw_upcalls: the source and group taken from x->bm_mfc, the configured
 * thresholds, the values measured so far and the measured time interval
 * (derived from *nowp and x->bm_start_time).  When the array already holds
 * BW_UPCALLS_MAX entries it is flushed with bw_upcalls_send() first.
 *
 * The MFC lock must be held (asserted below).
 */
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
    struct timeval delta;
    struct bw_upcall *u;

    MFC_LOCK_ASSERT();

    /*
     * Compute the measured time interval
     */
    delta = *nowp;
    BW_TIMEVALDECR(&delta, &x->bm_start_time);

    /*
     * If there are too many pending upcalls, deliver them now
     */
    if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
	bw_upcalls_send();

    /*
     * Set the bw_upcall entry
     */
    u = &V_bw_upcalls[V_bw_upcalls_n++];
    u->bu_src = x->bm_mfc->mfc_origin;
    u->bu_dst = x->bm_mfc->mfc_mcastgrp;
    u->bu_threshold.b_time = x->bm_threshold.b_time;
    u->bu_threshold.b_packets = x->bm_threshold.b_packets;
    u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
    u->bu_measured.b_time = delta;
    u->bu_measured.b_packets = x->bm_measured.b_packets;
    u->bu_measured.b_bytes = x->bm_measured.b_bytes;
    /* Translate the BW_METER_* flag bits into their BW_UPCALL_* forms */
    u->bu_flags = 0;
    if (x->bm_flags & BW_METER_UNIT_PACKETS)
	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
    if (x->bm_flags & BW_METER_UNIT_BYTES)
	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
    if (x->bm_flags & BW_METER_GEQ)
	u->bu_flags |= BW_UPCALL_GEQ;
    if (x->bm_flags & BW_METER_LEQ)
	u->bu_flags |= BW_UPCALL_LEQ;
}

/*
 * Send the pending bandwidth-related upcalls
 *
 * Builds one mbuf holding an igmpmsg header of type IGMPMSG_BW_UPCALL
 * followed by the V_bw_upcalls_n queued bw_upcall entries and passes it to
 * socket_send() on V_ip_mrouter.  Does nothing when no upcalls are pending.
 * The pending count is cleared before the mbuf is allocated, so the queued
 * entries are discarded if the allocation fails.
 *
 * The MFC lock must be held (asserted below).
 */
static void
bw_upcalls_send(void)
{
    struct mbuf *m;
    /* Payload size in bytes; captured here, before the count is reset */
    int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
    static struct igmpmsg igmpmsg = { 0,		/* unused1 */
				      0,		/* unused2 */
				      IGMPMSG_BW_UPCALL,/* im_msgtype */
				      0,		/* im_mbz  */
				      0,		/* im_vif  */
				      0,		/* unused3 */
				      { 0 },		/* im_src  */
				      { 0 } };		/* im_dst  */

    MFC_LOCK_ASSERT();

    if (V_bw_upcalls_n == 0)
	return;			/* No pending upcalls */

    V_bw_upcalls_n = 0;

    /*
     * Allocate a new mbuf, initialize it with the header and
     * the payload for the pending calls.
     */
    m = m_gethdr(M_NOWAIT, MT_DATA);
    if (m == NULL) {
	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
	return;
    }

    m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
    m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);

    /*
     * Send the upcalls
     * XXX do we need to set the address in k_igmpsrc ?
     */
    MRTSTAT_INC(mrts_upcalls);
    if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
	MRTSTAT_INC(mrts_upq_sockfull);
    }
}

/*
 * Compute the timeout hash value for the bw_meter entries
 *
 * The hash is the second at which the current measuring interval ends
 * (bm_start_time plus bm_threshold.b_time, rounded up to a whole second
 * when the microsecond part is non-zero), taken modulo BW_METER_BUCKETS.
 * Both macro arguments are expanded more than once, so they must be free
 * of side effects.
 */
#define BW_METER_TIMEHASH(bw_meter, hash)				\
    do {								\
	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
									\
	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
	(hash) = next_timeval.tv_sec;					\
	if (next_timeval.tv_usec)					\
	    (hash)++; /* XXX: make sure we don't timeout early */	\
	(hash) %= BW_METER_BUCKETS;					\
    } while (0)

/*
 * Schedule a timer to process periodically bw_meter entry of type "<="
 * by linking the entry in the proper hash bucket.
 *
 * Entries without BW_METER_LEQ are left untouched.  For the others the
 * measuring interval is restarted at *nowp (counters zeroed, the
 * BW_METER_UPCALL_DELIVERED flag cleared) and the entry is pushed onto the
 * head of the V_bw_meter_timers bucket selected by BW_METER_TIMEHASH; the
 * bucket index is remembered in bm_time_hash.
 *
 * The MFC lock must be held (asserted below).
 */
static void
schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
{
    int time_hash;

    MFC_LOCK_ASSERT();

    if (!(x->bm_flags & BW_METER_LEQ))
	return;		/* XXX: we schedule timers only for "<=" entries */

    /*
     * Reset the bw_meter entry
     */
    x->bm_start_time = *nowp;
    x->bm_measured.b_packets = 0;
    x->bm_measured.b_bytes = 0;
    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;

    /*
     * Compute the timeout hash value and insert the entry
     */
    BW_METER_TIMEHASH(x, time_hash);
    x->bm_time_next = V_bw_meter_timers[time_hash];
    V_bw_meter_timers[time_hash] = x;
    x->bm_time_hash = time_hash;
}

/*
 * Unschedule the periodic timer that processes bw_meter entry of type "<="
 * by removing the entry from the proper hash bucket.
*/ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. 
*/ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. 
*/ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
*/ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. 
*/ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. */ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. 
*/ static int -pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +pim_encapcheck(const struct mbuf *m __unused, int off __unused, + int proto __unused, void *arg __unused) { -#ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); -#endif - if (proto != IPPROTO_PIM) - return 0; /* not for us; reject the datagram. */ - - return 64; /* claim the datagram. */ + return (8); /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ -int -pim_input(struct mbuf **mp, int *offp, int proto) +static int +pim_input(struct mbuf *m, int off, int proto, void *arg __unused) { - struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; - int iphlen = *offp; + int iphlen = off; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; - - *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x", __func__, datalen, ntohl(ip->ip_src.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go agead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. 
*/ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reason perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? 
*/ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d", __func__, ntohl(encap_ip->ip_src.s_addr), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__, ntohl(encap_ip->ip_dst.s_addr)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. 
*/ mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copym() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. 
*/ - *mp = m; - rip_input(mp, offp, proto); - - return (IPPROTO_DONE); + return (rip_input(&m, &off, proto)); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { free(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; 
TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); - pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, - pim_encapcheck, &in_pim_protosw, NULL); + pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. 
*/ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { - encap_detach(pim_encap_cookie); + ip_encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); diff --git a/sys/netinet/pim_var.h b/sys/netinet/pim_var.h index e6398a4dfa95..dfb06928dc52 100644 --- a/sys/netinet/pim_var.h +++ b/sys/netinet/pim_var.h @@ -1,81 +1,79 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998-2000 * University of Southern California/Information Sciences Institute. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_PIM_VAR_H_ #define _NETINET_PIM_VAR_H_ /* * Protocol Independent Multicast (PIM), * kernel variables and implementation-specific definitions. * * Written by George Edmond Eddy (Rusty), ISI, February 1998. * Modified by Pavlin Radoslavov, USC/ISI, May 1998, Aug 1999, October 2000. * Modified by Hitoshi Asaeda, WIDE, August 1998. */ /* * PIM statistics kept in the kernel */ struct pimstat { uint64_t pims_rcv_total_msgs; /* total PIM messages received */ uint64_t pims_rcv_total_bytes; /* total PIM bytes received */ uint64_t pims_rcv_tooshort; /* rcvd with too few bytes */ uint64_t pims_rcv_badsum; /* rcvd with bad checksum */ uint64_t pims_rcv_badversion; /* rcvd bad PIM version */ uint64_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */ uint64_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ uint64_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ uint64_t pims_rcv_badregisters; /* rcvd invalid registers */ uint64_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ uint64_t pims_snd_registers_bytes; /* sent regs. 
bytes (data only) */ }; #ifdef _KERNEL #define PIMSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct pimstat, pimstat, name, (val)) #define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1) #endif /* * Identifiers for PIM sysctl nodes */ #define PIMCTL_STATS 1 /* statistics (read-only) */ #ifdef _KERNEL - -int pim_input(struct mbuf **, int *, int); SYSCTL_DECL(_net_inet_pim); #endif #endif /* _NETINET_PIM_VAR_H_ */ diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index e67d64eaf131..0fc0e66cab05 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -1,232 +1,221 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #ifdef INET6 #include #include #include #endif #include #ifdef INET6 #include #include #endif #include #define GIF_HLIM 30 static VNET_DEFINE(int, ip6_gif_hlim) = GIF_HLIM; #define V_ip6_gif_hlim VNET(ip6_gif_hlim) SYSCTL_DECL(_net_inet6_ip6); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(ip6_gif_hlim), 0, ""); - -static int in6_gif_input(struct mbuf **, int *, int); - -extern struct domain inet6domain; -static struct protosw in6_gif_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inet6domain, - .pr_protocol = 0, /* IPPROTO_IPV[46] */ - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = in6_gif_input, - .pr_output = rip6_output, - .pr_ctloutput = rip6_ctloutput, - .pr_usrreqs = &rip6_usrreqs -}; +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gif_hlim), 0, + "Default hop limit for encapsulated packets"); int in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { GIF_RLOCK_TRACKER; struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; int len; /* prepend new IP header */ len = 
sizeof(struct ip6_hdr); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in6_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip6 = mtod(m, struct ip6_hdr *); GIF_RLOCK(sc); if (sc->gif_family != AF_INET6) { m_freem(m); GIF_RUNLOCK(sc); return (ENETDOWN); } bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr)); GIF_RUNLOCK(sc); ip6->ip6_flow |= htonl((uint32_t)ecn << 20); ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. */ return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL)); } static int -in6_gif_input(struct mbuf **mp, int *offp, int proto) +in6_gif_input(struct mbuf *m, int off, int proto, void *arg) { - struct mbuf *m = *mp; + struct gif_softc *sc = arg; struct ifnet *gifp; - struct gif_softc *sc; struct ip6_hdr *ip6; uint8_t ecn; - sc = encap_getarg(m); if (sc == NULL) { m_freem(m); IP6STAT_INC(ip6s_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip6 = mtod(m, struct ip6_hdr *); ecn = (ntohl(ip6->ip6_flow) >> 20) & 0xff; - m_adj(m, *offp); + m_adj(m, off); gif_input(m, gifp, proto, ecn); } else { m_freem(m); IP6STAT_INC(ip6s_nogif); } return (IPPROTO_DONE); } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int in6_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { const struct ip6_hdr *ip6; struct gif_softc *sc; int ret; /* sanity check done in caller */ sc = (struct gif_softc *)arg; GIF_RLOCK_ASSERT(sc); /* * Check for address match. 
Note that the check is for an incoming * packet. We should compare the *source* address in our configuration * and the *destination* address of the packet, and vice versa. */ ip6 = mtod(m, const struct ip6_hdr *); if (!IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst)) return (0); ret = 128; if (!IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, &ip6->ip6_src)) { if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0) return (0); } else ret += 128; /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { struct nhop6_basic nh6; /* XXX empty scope id */ if (fib6_lookup_nh_basic(sc->gif_fibnum, &ip6->ip6_src, 0, 0, 0, &nh6) != 0) return (0); if (nh6.nh_ifp != m->m_pkthdr.rcvif) return (0); } return (ret); } +static const struct encap_config ipv6_encap_cfg = { + .proto = -1, + .min_length = sizeof(struct ip6_hdr), + .exact_match = (sizeof(struct in6_addr) << 4) + 8, + .check = gif_encapcheck, + .input = in6_gif_input +}; + int in6_gif_attach(struct gif_softc *sc) { KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); - sc->gif_ecookie = encap_attach_func(AF_INET6, -1, gif_encapcheck, - (void *)&in6_gif_protosw, sc); - if (sc->gif_ecookie == NULL) - return (EEXIST); + sc->gif_ecookie = ip6_encap_attach(&ipv6_encap_cfg, sc, M_WAITOK); return (0); } diff --git a/sys/netinet6/ip6_gre.c b/sys/netinet6/ip6_gre.c index 095a1deeb633..4519582c525b 100644 --- a/sys/netinet6/ip6_gre.c +++ b/sys/netinet6/ip6_gre.c @@ -1,146 +1,138 @@ /*- * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #include -extern struct domain inet6domain; -struct protosw in6_gre_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inet6domain, - .pr_protocol = IPPROTO_GRE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = gre_input, - .pr_output = rip6_output, - .pr_ctloutput = rip6_ctloutput, - .pr_usrreqs = &rip6_usrreqs -}; - VNET_DEFINE(int, ip6_gre_hlim) = IPV6_DEFHLIM; #define V_ip6_gre_hlim VNET(ip6_gre_hlim) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, OID_AUTO, grehlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gre_hlim), 0, "Default hop limit for encapsulated packets"); static int in6_gre_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct ip6_hdr *ip6; sc = 
(struct gre_softc *)arg; if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); M_ASSERTPKTHDR(m); /* * We expect that payload contains at least IPv4 * or IPv6 packet. */ if (m->m_pkthdr.len < sizeof(struct greip6) + #ifdef INET sizeof(struct ip)) #else sizeof(struct ip6_hdr)) #endif return (0); GRE_RLOCK(sc); if (sc->gre_family == 0) goto bad; KASSERT(sc->gre_family == AF_INET6, ("wrong gre_family: %d", sc->gre_family)); ip6 = mtod(m, struct ip6_hdr *); if (!IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, &ip6->ip6_dst) || !IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &ip6->ip6_src)) goto bad; GRE_RUNLOCK(sc); - return (128 * 2); + return (128 * 2 + 32); bad: GRE_RUNLOCK(sc); return (0); } int in6_gre_output(struct mbuf *m, int af, int hlen) { struct greip6 *gi6; gi6 = mtod(m, struct greip6 *); gi6->gi6_ip6.ip6_hlim = V_ip6_gre_hlim; return (ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL, NULL)); } +static const struct encap_config ipv6_encap_cfg = { + .proto = IPPROTO_GRE, + .min_length = sizeof(struct greip6) + sizeof(struct ip), + .exact_match = (sizeof(struct in6_addr) << 4) + 32, + .check = in6_gre_encapcheck, + .input = gre_input +}; + int in6_gre_attach(struct gre_softc *sc) { KASSERT(sc->gre_ecookie == NULL, ("gre_ecookie isn't NULL")); - sc->gre_ecookie = encap_attach_func(AF_INET6, IPPROTO_GRE, - in6_gre_encapcheck, &in6_gre_protosw, sc); - if (sc->gre_ecookie == NULL) - return (EEXIST); + sc->gre_ecookie = ip6_encap_attach(&ipv6_encap_cfg, sc, M_WAITOK); return (0); } diff --git a/sys/netinet6/ip6_mroute.c b/sys/netinet6/ip6_mroute.c index 524d35a27d99..b4a7a3adda5b 100644 --- a/sys/netinet6/ip6_mroute.c +++ b/sys/netinet6/ip6_mroute.c @@ -1,1972 +1,1963 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 * BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry"); static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int set_pim6(int *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *); extern int in6_mcast_loop; extern struct domain inet6domain; static const struct encaptab *pim6_encap_cookie; -static const struct protosw in6_pim_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inet6domain, - .pr_protocol = IPPROTO_PIM, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, - .pr_input = pim6_input, - .pr_output = rip6_output, - .pr_ctloutput = rip6_ctloutput, - .pr_usrreqs = &rip6_usrreqs -}; static int pim6_encapcheck(const struct mbuf *, int, int, void *); +static int pim6_input(struct mbuf *, int, int, void *); + +static const struct encap_config ipv6_encap_cfg = { + .proto = IPPROTO_PIM, + .min_length = sizeof(struct ip6_hdr) + PIM_MINLEN, + .exact_match = 8, + .check = pim6_encapcheck, + .input = pim6_input +}; + static VNET_DEFINE(int, ip6_mrouter_ver) = 0; #define V_ip6_mrouter_ver VNET(ip6_mrouter_ver) SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); static struct mrt6stat 
mrt6stat; SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW, &mrt6stat, mrt6stat, "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)"); #define MRT6STAT_INC(name) mrt6stat.name += 1 #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 static struct mtx mrouter6_mtx; #define MROUTER6_LOCK() mtx_lock(&mrouter6_mtx) #define MROUTER6_UNLOCK() mtx_unlock(&mrouter6_mtx) #define MROUTER6_LOCK_ASSERT() do { \ mtx_assert(&mrouter6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MROUTER6_LOCK_INIT() \ mtx_init(&mrouter6_mtx, "IPv6 multicast forwarding", NULL, MTX_DEF) #define MROUTER6_LOCK_DESTROY() mtx_destroy(&mrouter6_mtx) static struct mf6c *mf6ctable[MF6CTBLSIZ]; SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD, &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]", "IPv6 Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], " "netinet6/ip6_mroute.h)"); static struct mtx mfc6_mtx; #define MFC6_LOCK() mtx_lock(&mfc6_mtx) #define MFC6_UNLOCK() mtx_unlock(&mfc6_mtx) #define MFC6_LOCK_ASSERT() do { \ mtx_assert(&mfc6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC6_LOCK_INIT() \ mtx_init(&mfc6_mtx, "IPv6 multicast forwarding cache", NULL, MTX_DEF) #define MFC6_LOCK_DESTROY() mtx_destroy(&mfc6_mtx) static u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; static int sysctl_mif6table(SYSCTL_HANDLER_ARGS) { struct mif6_sctl *out; int error; out = malloc(sizeof(struct mif6_sctl) * MAXMIFS, M_TEMP, M_WAITOK); for (int i = 0; i < MAXMIFS; i++) { out[i].m6_flags = mif6table[i].m6_flags; out[i].m6_rate_limit = mif6table[i].m6_rate_limit; out[i].m6_lcl_addr = mif6table[i].m6_lcl_addr; if (mif6table[i].m6_ifp != NULL) out[i].m6_ifp = mif6table[i].m6_ifp->if_index; else out[i].m6_ifp = 0; out[i].m6_pkt_in = mif6table[i].m6_pkt_in; out[i].m6_pkt_out = mif6table[i].m6_pkt_out; out[i].m6_bytes_in = mif6table[i].m6_bytes_in; out[i].m6_bytes_out = mif6table[i].m6_bytes_out; } error = SYSCTL_OUT(req, 
out, sizeof(struct mif6_sctl) * MAXMIFS); free(out, M_TEMP); return (error); } SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, mif6table, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_mif6table, "S,mif6_sctl[MAXMIFS]", "IPv6 Multicast Interfaces (struct mif6_sctl[MAXMIFS], " "netinet6/ip6_mroute.h)"); static struct mtx mif6_mtx; #define MIF6_LOCK() mtx_lock(&mif6_mtx) #define MIF6_UNLOCK() mtx_unlock(&mif6_mtx) #define MIF6_LOCK_ASSERT() mtx_assert(&mif6_mtx, MA_OWNED) #define MIF6_LOCK_INIT() \ mtx_init(&mif6_mtx, "IPv6 multicast interfaces", NULL, MTX_DEF) #define MIF6_LOCK_DESTROY() mtx_destroy(&mif6_mtx) #ifdef MRT6DEBUG static VNET_DEFINE(u_int, mrt6debug) = 0; /* debug level */ #define V_mrt6debug VNET(mrt6debug) #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #define DEBUG_ERR 0x80 #define DEBUG_ANY 0x7f #define MRT6_DLOG(m, fmt, ...) \ if (V_mrt6debug & (m)) \ log(((m) & DEBUG_ERR) ? LOG_ERR: LOG_DEBUG, \ "%s: " fmt "\n", __func__, ##__VA_ARGS__) #else #define MRT6_DLOG(m, fmt, ...) #endif static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * XXX TODO: maintain a count to if_allmulti() calls in struct ifnet. */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). Different from IPv4 register_if, * these interfaces are linked into the system ifnet list, * because per-interface IPv6 statistics are maintained in * ifp->if_afdata. But it does not have any routes point * to them. I.e., packets can't be sent this way. They * only exist as a placeholder for multicast source * verification. */ static struct ifnet *multicast_register_if6; #define ENCAP_HOPS 64 /* * Private variables. 
*/ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RW, &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim6_var.h)"); #define PIM6STAT_INC(name) pim6stat.name += 1 static VNET_DEFINE(int, pim6); #define V_pim6 VNET(pim6) /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. */ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ MRT6STAT_INC(mrt6s_mfc_misses); \ } \ } while (/*CONSTCOND*/ 0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code * XXX: replace with timersub() ? */ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (/*CONSTCOND*/ 0) /* XXX: replace with timercmp(a, b, <) ? 
*/ #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 static u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int ip6_mrouter_init(struct socket *, int, int); static int add_m6fc(struct mf6cctl *); static int add_m6if(struct mif6ctl *); static int del_m6fc(struct mf6cctl *); static int del_m6if(mifi_t *); static int del_m6if_locked(mifi_t *); static int get_mif6_cnt(struct sioc_mif_req6 *); static int get_sg_cnt(struct sioc_sg_req6 *); static struct callout expire_upcalls_ch; int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int X_ip6_mrouter_done(void); int X_ip6_mrouter_set(struct socket *, struct sockopt *); int X_ip6_mrouter_get(struct socket *, struct sockopt *); int X_mrt6_ioctl(u_long, caddr_t); /* * Handle MRT setsockopt commands to modify the multicast routing tables. */ int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt) { int error = 0; int optval; struct mif6ctl mifc; struct mf6cctl mfcc; mifi_t mifi; if (so != V_ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EPERM); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = ip6_mrouter_init(so, optval, sopt->sopt_name); break; case MRT6_DONE: error = X_ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc)); if (error) break; error = add_m6if(&mifc); break; case MRT6_ADD_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = add_m6fc(&mfcc); break; case MRT6_DEL_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = del_m6fc(&mfcc); break; case MRT6_DEL_MIF: error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi)); if (error) break; error = del_m6if(&mifi); break; case MRT6_PIM: 
error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = set_pim6(&optval); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Handle MRT getsockopt commands */ int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { int error = 0; if (so != V_ip6_mrouter) return (EACCES); switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int X_mrt6_ioctl(u_long cmd, caddr_t data) { int ret; ret = EINVAL; switch (cmd) { case SIOCGETSGCNT_IN6: ret = get_sg_cnt((struct sioc_sg_req6 *)data); break; case SIOCGETMIFCNT_IN6: ret = get_mif6_cnt((struct sioc_mif_req6 *)data); break; default: break; } return (ret); } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req6 *req) { struct mf6c *rt; int ret; ret = 0; MFC6_LOCK(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); if (rt == NULL) { ret = ESRCH; } else { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } MFC6_UNLOCK(); return (ret); } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(struct sioc_mif_req6 *req) { mifi_t mifi; int ret; ret = 0; mifi = req->mifi; MIF6_LOCK(); if (mifi >= nummifs) { ret = EINVAL; } else { req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; } MIF6_UNLOCK(); return (ret); } static int set_pim6(int *i) { if ((*i != 1) && (*i != 0)) return (EINVAL); V_pim6 = *i; return (0); } /* * Enable multicast routing */ static int ip6_mrouter_init(struct socket *so, int v, int cmd) { MRT6_DLOG(DEBUG_ANY, "so_type = %d, pr_protocol = %d", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || 
so->so_proto->pr_protocol != IPPROTO_ICMPV6) return (EOPNOTSUPP); if (v != 1) return (ENOPROTOOPT); MROUTER6_LOCK(); if (V_ip6_mrouter != NULL) { MROUTER6_UNLOCK(); return (EADDRINUSE); } V_ip6_mrouter = so; V_ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); V_pim6 = 0;/* used for stubbing out/in pim stuff */ callout_init(&expire_upcalls_ch, 0); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "finished"); return (0); } /* * Disable IPv6 multicast forwarding. */ int X_ip6_mrouter_done(void) { mifi_t mifi; u_long i; struct mf6c *rt; struct rtdetq *rte; MROUTER6_LOCK(); if (V_ip6_mrouter == NULL) { MROUTER6_UNLOCK(); return (EINVAL); } /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if_allmulti(mif6table[mifi].m6_ifp, 0); } } bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; V_pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. 
*/ MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE6); } } bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); MFC6_UNLOCK(); /* * Reset register interface */ if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } V_ip6_mrouter = NULL; V_ip6_mrouter_ver = 0; MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "finished"); return (0); } static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; /* * Add a mif to the mif table */ static int add_m6if(struct mif6ctl *mifcp) { struct mif6 *mifp; struct ifnet *ifp; int error; MIF6_LOCK(); if (mifcp->mif6c_mifi >= MAXMIFS) { MIF6_UNLOCK(); return (EINVAL); } mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp != NULL) { MIF6_UNLOCK(); return (EADDRINUSE); /* XXX: is it appropriate? */ } if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) { MIF6_UNLOCK(); return (ENXIO); } ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { ifp = if_alloc(IFT_OTHER); if_initname(ifp, "register_mif", 0); ifp->if_flags |= IFF_LOOPBACK; if_attach(ifp); multicast_register_if6 = ifp; reg_mif_num = mifcp->mif6c_mifi; /* * it is impossible to guess the ifindex of the * register interface. So mif6c_pifi is automatically * calculated. 
*/ mifcp->mif6c_pifi = ifp->if_index; } else { ifp = multicast_register_if6; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { MIF6_UNLOCK(); return (EOPNOTSUPP); } error = if_allmulti(ifp, 1); if (error) { MIF6_UNLOCK(); return (error); } } mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; MIF6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "mif #%d, phyint %s", mifcp->mif6c_mifi, if_name(ifp)); return (0); } /* * Delete a mif from the mif table */ static int del_m6if_locked(mifi_t *mifip) { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; MIF6_LOCK_ASSERT(); if (*mifip >= nummifs) return (EINVAL); if (mifp->m6_ifp == NULL) return (EINVAL); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* XXX: TODO: Maintain an ALLMULTI refcount in struct ifnet. 
*/ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } else { if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } } bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; MRT6_DLOG(DEBUG_ANY, "mif %d, nummifs %d", *mifip, nummifs); return (0); } static int del_m6if(mifi_t *mifip) { int cc; MIF6_LOCK(); cc = del_m6if_locked(mifip); MIF6_UNLOCK(); return (cc); } /* * Add an mfc entry */ static int add_m6fc(struct mf6cctl *mfccp) { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; MFC6_LOCK(); MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { MRT6_DLOG(DEBUG_MFC, "no upcall o %s g %s p %x", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; MFC6_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); MRT6_DLOG(DEBUG_MFC, "o %s g %s p %x dbg %p", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), 
ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE6); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { MRT6_DLOG(DEBUG_MFC, "no upcall h %lu o %s g %s p %x", hash, ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ 
rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } MFC6_UNLOCK(); return (0); } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(struct timeval *t) { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(struct mf6cctl *mfccp) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); MRT6_DLOG(DEBUG_MFC, "orig %s mcastgrp %s", ip6_sprintf(ip6bufo, &origin.sin6_addr), ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr)); MFC6_LOCK(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { MFC6_UNLOCK(); return (EADDRNOTAVAIL); } *nptr = rt->mf6c_next; free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (0); } static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src) { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return (0); } } m_freem(mm); return (-1); } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. 
* * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. * * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff * this function is called in the originating context (i.e., not when * forwarding a packet from other node). ip6_output(), which is currently the * only function that calls this function is called in the originating context, * explicitly ensures this condition. It is caller's responsibility to ensure * that if this function is called from somewhere else in the originating * context in the future. */ int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { struct rtdetq *rte; struct mbuf *mb0; struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; u_long hash; mifi_t mifi; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ MRT6_DLOG(DEBUG_FORWARD, "src %s, dst %s, ifindex %d", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp->if_index); /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return (0); ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6STAT_INC(ip6s_cantforward); if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return (0); } MFC6_LOCK(); /* * Determine forwarding mifs from the forwarding cache table */ MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); MRT6STAT_INC(mrt6s_mfc_lookups); /* Entry exists, so forward if necessary */ if (rt) { MFC6_UNLOCK(); return (ip6_mdq(m, ifp, rt)); } /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon. */ MRT6STAT_INC(mrt6s_no_route); MRT6_DLOG(DEBUG_FORWARD | DEBUG_MFC, "no rte s %s g %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6, M_NOWAIT); if (rte == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } mb0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* is there an upcall waiting for this packet? 
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (ENOBUFS); } /* * Make a copy of the header to send to the user * level process */ mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_NOWAIT); if (mm == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (EINVAL); } MRT6_DLOG(DEBUG_FORWARD, "getting the iif info in the kernel"); for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); MRT6STAT_INC(mrt6s_upq_sockfull); free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } MRT6STAT_INC(mrt6s_upcalls); /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = 
sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { MRT6STAT_INC(mrt6s_upq_ovflw); free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (0); } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ MFC6_UNLOCK(); return (0); } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
*/ static void expire_upcalls(void *unused) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct rtdetq *rte; struct mf6c *mfc, **nptr; u_long i; MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { MRT6_DLOG(DEBUG_EXPIRE, "expiring (%s %s)", ip6_sprintf(ip6bufo, &mfc->mf6c_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfc->mf6c_mcastgrp.sin6_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } while (rte != NULL); MRT6STAT_INC(mrt6s_cache_cleanups); n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE6); } else { nptr = &mfc->mf6c_next; } } } MFC6_UNLOCK(); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; struct in6_addr src0, dst0; /* copies for local work */ u_int32_t iszone, idzone, oszone, odzone; int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (/*CONSTCOND*/ 0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. 
*/ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ MRT6_DLOG(DEBUG_FORWARD, "wrong if: ifid %d mifi %d mififid %x", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); MRT6STAT_INC(mrt6s_wrong_if); rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (V_pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... */ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif mm = m_copym(m, 0, sizeof(struct ip6_hdr), M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return (ENOBUFS); #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return (EINVAL); } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } /* if socket Q full */ } /* if PIM */ return (0); } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. 
*/ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ src0 = ip6->ip6_src; dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { IP6STAT_INC(ip6s_badscope); return (error); } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. */ if (!(mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if (in6_setscope(&src0, mif6table[mifi].m6_ifp, &oszone) || in6_setscope(&dst0, mif6table[mifi].m6_ifp, &odzone) || iszone != oszone || idzone != odzone) { IP6STAT_INC(ip6s_badscope); continue; } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } } return (0); } static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; u_long linkmtu; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. 
*/ mb_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { return; } /* set MCAST flag to the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may devide * the packet into fragments when the packet is too big for the * outgoing interface. * Otherwise, we can simply send the packet to the interface * sending queue. */ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, NULL, IPV6_FORWARDING, &im6o, NULL, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); return; } /* * If configured to loop back multicasts by default, * loop back a copy now. */ if (in6_mcast_loop) ip6_mloopback(ifp, m); /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ linkmtu = IN6_LINKMTU(ifp); if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) { struct sockaddr_in6 dst6; bzero(&dst6, sizeof(dst6)); dst6.sin6_len = sizeof(struct sockaddr_in6); dst6.sin6_family = AF_INET6; dst6.sin6_addr = ip6->ip6_dst; IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6); /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ m_clrprotoflags(m); /* Avoid confusing lower layers. 
*/ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&dst6, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); } else { /* * pMTU discovery is intentionally disabled by default, since * various router may notify pMTU in multicast, which can be * a DDoS to a router */ if (V_ip6_mcast_pmtu) icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu); else { MRT6_DLOG(DEBUG_XMIT, " packet too big on %s o %s " "g %s size %d (discarded)", if_name(ifp), ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), mb_copy->m_pkthdr.len); m_freem(mb_copy); /* simply discard the packet */ } } } static int register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; MRT6_DLOG(DEBUG_ANY, "src %s dst %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); PIM6STAT_INC(pim6s_snd_registers); /* Make a copy of the packet to send to the user level process. */ mm = m_gethdr(M_NOWAIT, MT_DATA); if (mm == NULL) return (ENOBUFS); mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); if ((mm->m_next = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { m_freem(mm); return (ENOBUFS); } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return (ENOBUFS); /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. 
encap.n */ MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } return (0); } /* * pim6_encapcheck() is called by the encap6_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int -pim6_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +pim6_encapcheck(const struct mbuf *m __unused, int off __unused, + int proto __unused, void *arg __unused) { -#ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); -#endif - if (proto != IPPROTO_PIM) - return 0; /* not for us; reject the datagram. */ - - return 64; /* claim the datagram. */ + return (8); /* claim the datagram. */ } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. */ -int -pim6_input(struct mbuf **mp, int *offp, int proto) +static int +pim6_input(struct mbuf *m, int off, int proto, void *arg __unused) { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; - struct mbuf *m = *mp; int minlen; - int off = *offp; PIM6STAT_INC(pim6s_rcv_total); ip6 = mtod(m, struct ip6_hdr *); - pimlen = m->m_pkthdr.len - *offp; + pimlen = m->m_pkthdr.len - off; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); MRT6_DLOG(DEBUG_PIM, "PIM packet too short"); m_freem(m); return (IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? 
PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers in contiguous memory, and * possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { PIM6STAT_INC(pim6s_rcv_tooshort); return (IPPROTO_DONE); } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { PIM6STAT_INC(pim6s_rcv_badsum); MRT6_DLOG(DEBUG_PIM, "invalid checksum"); m_freem(m); return (IPPROTO_DONE); } } #endif /* PIM_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { PIM6STAT_INC(pim6s_rcv_badversion); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "incorrect version %d, expecting %d", pim->pim_ver, PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon. 
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif PIM6STAT_INC(pim6s_rcv_registers); if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { MRT6_DLOG(DEBUG_PIM, "register mif not set: %d", reg_mif_num); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "register packet " "size too small %d from %s", pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src)); m_freem(m); return (IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); MRT6_DLOG(DEBUG_PIM, "eip6: %s -> %s, eip6 plen %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), ntohs(eip6->ip6_plen)); /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY, "invalid IP version (%d) " "of the inner packet", (eip6->ip6_vfc & IPV6_VERSION)); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_PIM, "inner packet of register " "is not multicast %s", ip6_sprintf(ip6bufd, &eip6->ip6_dst)); m_freem(m); return (IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "pim register: " "could not copy register head"); m_freem(m); return (IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. 
*/ m_adj(m, off + PIM_MINLEN); MRT6_DLOG(DEBUG_PIM, "forwarding decapsulated register: " "src %s, dst %s, mif %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), reg_mif_num); rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. */ pim6_input_to_daemon: - rip6_input(&m, offp, proto); - return (IPPROTO_DONE); + return (rip6_input(&m, &off, proto)); } static int ip6_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER6_LOCK_INIT(); MFC6_LOCK_INIT(); MIF6_LOCK_INIT(); - pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM, - pim6_encapcheck, - (const struct protosw *)&in6_pim_protosw, NULL); + pim6_encap_cookie = ip6_encap_attach(&ipv6_encap_cfg, + NULL, M_WAITOK); if (pim6_encap_cookie == NULL) { printf("ip6_mroute: unable to attach pim6 encap\n"); MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); return (EINVAL); } ip6_mforward = X_ip6_mforward; ip6_mrouter_done = X_ip6_mrouter_done; ip6_mrouter_get = X_ip6_mrouter_get; ip6_mrouter_set = X_ip6_mrouter_set; mrt6_ioctl = X_mrt6_ioctl; break; case MOD_UNLOAD: if (V_ip6_mrouter != NULL) return EINVAL; if (pim6_encap_cookie) { - encap_detach(pim6_encap_cookie); + ip6_encap_detach(pim6_encap_cookie); pim6_encap_cookie = NULL; } X_ip6_mrouter_done(); ip6_mforward = NULL; ip6_mrouter_done = NULL; ip6_mrouter_get = NULL; ip6_mrouter_set = NULL; mrt6_ioctl = NULL; MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t ip6_mroutemod = { "ip6_mroute", ip6_mroute_modevent, 0 }; DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PROTO_MC, 
SI_ORDER_ANY); diff --git a/sys/netinet6/pim6_var.h b/sys/netinet6/pim6_var.h index 7afe89b9f844..7288c67eee22 100644 --- a/sys/netinet6/pim6_var.h +++ b/sys/netinet6/pim6_var.h @@ -1,65 +1,61 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: pim6_var.h,v 1.8 2000/06/06 08:07:43 jinmei Exp $ * $FreeBSD$ */ /* * Protocol Independent Multicast (PIM), * implementation-specific definitions. 
* * Written by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Ivanov Radoslavov, USC/ISI, May 1998 */ #ifndef _NETINET6_PIM6_VAR_H_ #define _NETINET6_PIM6_VAR_H_ struct pim6stat { uint64_t pim6s_rcv_total; /* total PIM messages received */ uint64_t pim6s_rcv_tooshort; /* received with too few bytes */ uint64_t pim6s_rcv_badsum; /* received with bad checksum */ uint64_t pim6s_rcv_badversion; /* received bad PIM version */ uint64_t pim6s_rcv_registers; /* received registers */ uint64_t pim6s_rcv_badregisters; /* received invalid registers */ uint64_t pim6s_snd_registers; /* sent registers */ }; -#if (defined(KERNEL)) || (defined(_KERNEL)) -int pim6_input(struct mbuf **, int*, int); -#endif /* KERNEL */ - /* * Identifiers for PIM sysctl nodes */ #define PIM6CTL_STATS 1 /* statistics (read-only) */ #endif /* _NETINET6_PIM6_VAR_H_ */ diff --git a/sys/netipsec/xform_ipcomp.c b/sys/netipsec/xform_ipcomp.c index 7226e31dc191..7a4a630bb693 100644 --- a/sys/netipsec/xform_ipcomp.c +++ b/sys/netipsec/xform_ipcomp.c @@ -1,787 +1,777 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* IP payload compression protocol (IPComp), see RFC 2393 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #include VNET_DEFINE(int, ipcomp_enable) = 1; VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat); VNET_PCPUSTAT_SYSINIT(ipcompstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipcompstat); #endif /* VIMAGE */ SYSCTL_DECL(_net_inet_ipcomp); SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats, struct ipcompstat, ipcompstat, "IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h"); static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); /* * RFC 3173 p 2.2. Non-Expansion Policy: * If the total size of a compressed payload and the IPComp header, as * defined in section 3, is not smaller than the size of the original * payload, the IP datagram MUST be sent in the original non-compressed * form. 
* * When we use IPComp in tunnel mode, for small packets we will receive * encapsulated IP-IP datagrams without any compression and without IPComp * header. */ static int ipcomp_encapcheck(union sockaddr_union *src, union sockaddr_union *dst) { struct secasvar *sav; sav = key_allocsa_tunnel(src, dst, IPPROTO_IPCOMP); if (sav == NULL) return (0); key_freesav(&sav); if (src->sa.sa_family == AF_INET) return (sizeof(struct in_addr) << 4); else return (sizeof(struct in6_addr) << 4); } static int -ipcomp_nonexp_input(struct mbuf **mp, int *offp, int proto) +ipcomp_nonexp_input(struct mbuf *m, int off, int proto, void *arg __unused) { int isr; switch (proto) { #ifdef INET case IPPROTO_IPV4: isr = NETISR_IP; break; #endif #ifdef INET6 case IPPROTO_IPV6: isr = NETISR_IPV6; break; #endif default: IPCOMPSTAT_INC(ipcomps_nopf); - m_freem(*mp); + m_freem(m); return (IPPROTO_DONE); } - m_adj(*mp, *offp); - IPCOMPSTAT_ADD(ipcomps_ibytes, (*mp)->m_pkthdr.len); + m_adj(m, off); + IPCOMPSTAT_ADD(ipcomps_ibytes, m->m_pkthdr.len); IPCOMPSTAT_INC(ipcomps_input); - netisr_dispatch(isr, *mp); + netisr_dispatch(isr, m); return (IPPROTO_DONE); } /* * ipcomp_init() is called when an CPI is being set up. */ static int ipcomp_init(struct secasvar *sav, struct xformsw *xsp) { const struct comp_algo *tcomp; struct cryptoini cric; /* NB: algorithm really comes in alg_enc and not alg_comp! 
*/ tcomp = comp_algorithm_lookup(sav->alg_enc); if (tcomp == NULL) { DPRINTF(("%s: unsupported compression algorithm %d\n", __func__, sav->alg_comp)); return EINVAL; } sav->alg_comp = sav->alg_enc; /* set for doing histogram */ sav->tdb_xform = xsp; sav->tdb_compalgxform = tcomp; /* Initialize crypto session */ bzero(&cric, sizeof (cric)); cric.cri_alg = sav->tdb_compalgxform->type; return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support); } /* * ipcomp_zeroize() used when IPCA is deleted */ static int ipcomp_zeroize(struct secasvar *sav) { int err; err = crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = 0; return err; } /* * ipcomp_input() gets called to uncompress an input packet */ static int ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { struct xform_data *xd; struct cryptodesc *crdc; struct cryptop *crp; struct ipcomp *ipcomp; caddr_t addr; int error, hlen = IPCOMP_HLENGTH; /* * Check that the next header of the IPComp is not IPComp again, before * doing any real work. Given it is not possible to do double * compression it means someone is playing tricks on us. */ error = ENOBUFS; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); key_freesav(&sav); return (error); } addr = (caddr_t) mtod(m, struct ip *) + skip; ipcomp = (struct ipcomp *)addr; if (ipcomp->comp_nxt == IPPROTO_IPCOMP) { IPCOMPSTAT_INC(ipcomps_pdrops); /* XXX have our own stats? 
*/ DPRINTF(("%s: recursive compression detected\n", __func__)); error = EINVAL; goto bad; } /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: no crypto descriptors\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); goto bad; } /* Get IPsec-specific opaque pointer */ xd = malloc(sizeof(*xd), M_XDATA, M_NOWAIT | M_ZERO); if (xd == NULL) { DPRINTF(("%s: cannot allocate xform_data\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); crypto_freereq(crp); goto bad; } crdc = crp->crp_desc; crdc->crd_skip = skip + hlen; crdc->crd_len = m->m_pkthdr.len - (skip + hlen); crdc->crd_inject = skip; /* Decompression operation */ crdc->crd_alg = sav->tdb_compalgxform->type; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len - (skip + hlen); crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_input_cb; crp->crp_opaque = (caddr_t) xd; /* These are passed as-is to the callback */ xd->sav = sav; xd->protoff = protoff; xd->skip = skip; xd->vnet = curvnet; SECASVAR_LOCK(sav); crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid; SECASVAR_UNLOCK(sav); return crypto_dispatch(crp); bad: m_freem(m); key_freesav(&sav); return (error); } /* * IPComp input callback from the crypto driver. 
*/
/*
 * Completion handler for a decompression request; ipcomp_input() stores it
 * in crp->crp_callback.
 *
 * crp->crp_buf carries the packet mbuf and crp->crp_opaque the xform_data
 * that holds the SA, skip, protoff, vnet and crypto session id saved when
 * the request was built.
 *
 * On EAGAIN the request is dispatched again after the session id has been
 * refreshed.  On success the xform_data and the crypto request are freed,
 * the IPComp header found at offset skip is stripped, the next-protocol
 * byte saved from that header is written back at protoff, and the packet is
 * passed to ipsec4_common_input_cb() or ipsec6_common_input_cb() depending
 * on the address family of the SA destination.
 *
 * Returns the crypto_dispatch() result on retry, the result of the
 * ipsec[46]_common_input_cb() call on success, or an errno value once the
 * bad: path has released whichever of sav, m, xd and crp is still non-NULL.
 */
static int
ipcomp_input_cb(struct cryptop *crp)
{
	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
	struct xform_data *xd;
	struct mbuf *m;
	struct secasvar *sav;
	struct secasindex *saidx;
	caddr_t addr;
	uint64_t cryptoid;
	int hlen = IPCOMP_HLENGTH, error, clen;
	int skip, protoff;
	uint8_t nproto;

	/* Recover the packet and the per-request state from the request. */
	m = (struct mbuf *) crp->crp_buf;
	xd = (struct xform_data *) crp->crp_opaque;
	/* Every return below is preceded by a matching CURVNET_RESTORE(). */
	CURVNET_SET(xd->vnet);
	sav = xd->sav;
	skip = xd->skip;
	protoff = xd->protoff;
	cryptoid = xd->cryptoid;
	saidx = &sav->sah->saidx;
	IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
		saidx->dst.sa.sa_family == AF_INET6,
		("unexpected protocol family %u", saidx->dst.sa.sa_family));

	/* Check for crypto errors */
	if (crp->crp_etype) {
		if (crp->crp_etype == EAGAIN) {
			/*
			 * Reset the session ID: when ipsec_updateid()
			 * returns non-zero the session id left in cryptoid
			 * is freed, then the id now in crp->crp_sid is
			 * remembered and the same request is resubmitted.
			 */
			if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
				crypto_freesession(cryptoid);
			xd->cryptoid = crp->crp_sid;
			CURVNET_RESTORE();
			return (crypto_dispatch(crp));
		}
		IPCOMPSTAT_INC(ipcomps_noxform);
		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
		error = crp->crp_etype;
		goto bad;
	}
	/* Shouldn't happen... */
	if (m == NULL) {
		IPCOMPSTAT_INC(ipcomps_crypto);
		DPRINTF(("%s: null mbuf returned from crypto\n", __func__));
		error = EINVAL;
		goto bad;
	}
	/* Per-algorithm histogram, indexed by the SA's alg_comp value. */
	IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);

	clen = crp->crp_olen;		/* Length of data after processing */

	/*
	 * Release the crypto descriptors.  Both pointers are set to NULL so
	 * that the bad: path below does not release them a second time.
	 */
	free(xd, M_XDATA), xd = NULL;
	crypto_freereq(crp), crp = NULL;

	/* In case it's not done already, adjust the size of the mbuf chain */
	m->m_pkthdr.len = clen + hlen + skip;

	/*
	 * The IPComp header must be in the first mbuf before it is read.
	 * If m_pullup() returns NULL, m becomes NULL and the bad: path
	 * skips m_freem().
	 */
	if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
		IPCOMPSTAT_INC(ipcomps_hdrops);		/*XXX*/
		DPRINTF(("%s: m_pullup failed\n", __func__));
		error = EINVAL;				/*XXX*/
		goto bad;
	}

	/* Keep the next protocol field */
	addr = (caddr_t) mtod(m, struct ip *) + skip;
	nproto = ((struct ipcomp *) addr)->comp_nxt;

	/* Remove the IPCOMP header */
	error = m_striphdr(m, skip, hlen);
	if (error) {
		IPCOMPSTAT_INC(ipcomps_hdrops);
		DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__,
		    ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
		    (u_long) ntohl(sav->spi)));
		goto bad;
	}

	/* Restore the Next Protocol field */
	m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto);

	/* Continue input processing for the SA's address family. */
	switch (saidx->dst.sa.sa_family) {
#ifdef INET6
	case AF_INET6:
		error = ipsec6_common_input_cb(m, sav, skip, protoff);
		break;
#endif
#ifdef INET
	case AF_INET:
		error = ipsec4_common_input_cb(m, sav, skip, protoff);
		break;
#endif
	default:
		panic("%s: Unexpected address family: %d saidx=%p", __func__,
		    saidx->dst.sa.sa_family, saidx);
	}

	CURVNET_RESTORE();
	return error;
bad:
	/* Error unwinding: release only what has not been released yet. */
	CURVNET_RESTORE();
	if (sav != NULL)
		key_freesav(&sav);
	if (m != NULL)
		m_freem(m);
	if (xd != NULL)
		free(xd, M_XDATA);
	if (crp != NULL)
		crypto_freereq(crp);
	return error;
}

/*
 * IPComp output routine, called by ipsec[46]_perform_request()
 *
 * Builds a one-descriptor compression request covering the ralen bytes that
 * start at offset skip and dispatches it with ipcomp_output_cb as the
 * completion callback.  sp, sav, idx, skip, protoff and the current vnet are
 * saved in an xform_data that travels with the request as crp_opaque.
 *
 * Returns the ipsec_process_done() result when the packet is at or below the
 * algorithm's minlen (no compression attempted), otherwise the
 * crypto_dispatch() result.  On failure returns EPFNOSUPPORT, EMSGSIZE or
 * ENOBUFS after freeing m (if non-NULL) and calling key_freesav() and
 * key_freesp().
 */
static int
ipcomp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav,
	u_int idx, int skip, int protoff)
{
	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
	const struct comp_algo *ipcompx;
	struct cryptodesc *crdc;
	struct cryptop *crp;
	struct xform_data *xd;
	int error, ralen, maxpacketsize;

	IPSEC_ASSERT(sav != NULL, ("null SA"));
	ipcompx = sav->tdb_compalgxform;
	IPSEC_ASSERT(ipcompx != NULL, ("null compression xform"));

	/*
	 * Do not touch the packet in case our payload to compress
	 * is lower than the minimal threshold of the compression
	 * algorithm.  We will just send out the data uncompressed.
	 * See RFC 3173, 2.2. Non-Expansion Policy.
	 *
	 * NOTE(review): the test uses the whole packet length
	 * (m->m_pkthdr.len), not the payload length ralen computed just
	 * below (which excludes the first skip bytes) -- confirm that this
	 * is intended.
	 */
	if (m->m_pkthdr.len <= ipcompx->minlen) {
		IPCOMPSTAT_INC(ipcomps_threshold);
		return ipsec_process_done(m, sp, sav, idx);
	}

	ralen = m->m_pkthdr.len - skip;	/* Raw payload length before comp. */
	IPCOMPSTAT_INC(ipcomps_output);

	/* Check for maximum packet size violations. */
	switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
	case AF_INET:
		maxpacketsize = IP_MAXPACKET;
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		maxpacketsize = IPV6_MAXPACKET;
		break;
#endif /* INET6 */
	default:
		IPCOMPSTAT_INC(ipcomps_nopf);
		DPRINTF(("%s: unknown/unsupported protocol family %d, "
		    "IPCA %s/%08lx\n", __func__,
		    sav->sah->saidx.dst.sa.sa_family,
		    ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
		    (u_long) ntohl(sav->spi)));
		error = EPFNOSUPPORT;
		goto bad;
	}
	/* Reject packets that could not take an added IPComp header. */
	if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) {
		IPCOMPSTAT_INC(ipcomps_toobig);
		DPRINTF(("%s: packet in IPCA %s/%08lx got too big "
		    "(len %u, max len %u)\n", __func__,
		    ipsec_address(&sav->sah->saidx.dst, buf, sizeof(buf)),
		    (u_long) ntohl(sav->spi),
		    ralen + skip + IPCOMP_HLENGTH, maxpacketsize));
		error = EMSGSIZE;
		goto bad;
	}

	/* Update the counters */
	IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip);

	/*
	 * m is replaced by the m_unshare() result; a NULL result is reported
	 * as ENOBUFS and the bad: path then has no mbuf to free.
	 */
	m = m_unshare(m, M_NOWAIT);
	if (m == NULL) {
		IPCOMPSTAT_INC(ipcomps_hdrops);
		DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n",
		    __func__, ipsec_address(&sav->sah->saidx.dst, buf,
		    sizeof(buf)), (u_long) ntohl(sav->spi)));
		error = ENOBUFS;
		goto bad;
	}

	/* Ok now, we can pass to the crypto processing. */

	/* Get crypto descriptors */
	crp = crypto_getreq(1);
	if (crp == NULL) {
		IPCOMPSTAT_INC(ipcomps_crypto);
		DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
		error = ENOBUFS;
		goto bad;
	}
	crdc = crp->crp_desc;

	/* Compression descriptor */
	crdc->crd_skip = skip;
	crdc->crd_len = ralen;
	crdc->crd_flags = CRD_F_COMP;
	crdc->crd_inject = skip;

	/* Compression operation */
	crdc->crd_alg = ipcompx->type;

	/* IPsec-specific opaque crypto info */
	xd = malloc(sizeof(struct xform_data), M_XDATA, M_NOWAIT | M_ZERO);
	if (xd == NULL) {
		IPCOMPSTAT_INC(ipcomps_crypto);
		DPRINTF(("%s: failed to allocate xform_data\n", __func__));
		/* The bad: path does not know about crp, so free it here. */
		crypto_freereq(crp);
		error = ENOBUFS;
		goto bad;
	}

	/* State handed to ipcomp_output_cb through crp_opaque. */
	xd->sp = sp;
	xd->sav = sav;
	xd->idx = idx;
	xd->skip = skip;
	xd->protoff = protoff;
	xd->vnet = curvnet;

	/* Crypto operation descriptor */
	crp->crp_ilen = m->m_pkthdr.len;	/* Total input length */
	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
	crp->crp_buf = (caddr_t) m;
	crp->crp_callback = ipcomp_output_cb;
	crp->crp_opaque = (caddr_t) xd;
	/* Read the SA's session id while holding the SA lock. */
	SECASVAR_LOCK(sav);
	crp->crp_sid = xd->cryptoid = sav->tdb_cryptoid;
	SECASVAR_UNLOCK(sav);

	return crypto_dispatch(crp);
bad:
	/* Failure before dispatch: drop the packet, then the SA and SP. */
	if (m)
		m_freem(m);
	key_freesav(&sav);
	key_freesp(&sp);
	return (error);
}

/*
 * IPComp output callback from the crypto driver.
*/
/*
 * ipcomp_output_cb: completion handler for the compression request.
 *
 * crp->crp_buf carries the mbuf chain and crp->crp_opaque the xform_data
 * holding the policy (sp), SA (sav), SA index (idx), skip and protoff.
 *
 * - On EAGAIN the session id is refreshed and the request is dispatched
 *   again; the return value is that of crypto_dispatch().
 * - If the compressed payload is smaller than the original one, an IPComp
 *   header is inserted at offset `skip', the next-protocol byte at
 *   `protoff' is set to IPPROTO_IPCOMP and the IP length field is updated.
 * - Otherwise the packet is passed on without an IPComp header.
 *
 * On success xd and crp are released and the mbuf is handed to
 * ipsec_process_done(), whose result is returned.  On failure the mbuf,
 * xd and crp are freed, key_freesav()/key_freesp() are called on sav/sp
 * and the error code is returned.
 */
static int
ipcomp_output_cb(struct cryptop *crp)
{
	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
	struct xform_data *xd;
	struct secpolicy *sp;
	struct secasvar *sav;
	struct mbuf *m;
	uint64_t cryptoid;
	u_int idx;
	int error, skip, protoff;

	m = (struct mbuf *) crp->crp_buf;
	xd = (struct xform_data *) crp->crp_opaque;
	CURVNET_SET(xd->vnet);
	idx = xd->idx;
	sp = xd->sp;
	sav = xd->sav;
	skip = xd->skip;
	protoff = xd->protoff;
	cryptoid = xd->cryptoid;

	/* Check for crypto errors */
	if (crp->crp_etype) {
		if (crp->crp_etype == EAGAIN) {
			/* Reset the session ID */
			if (ipsec_updateid(sav, &crp->crp_sid, &cryptoid) != 0)
				crypto_freesession(cryptoid);
			xd->cryptoid = crp->crp_sid;
			CURVNET_RESTORE();
			return (crypto_dispatch(crp));
		}
		IPCOMPSTAT_INC(ipcomps_noxform);
		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
		error = crp->crp_etype;
		goto bad;
	}
	/* Shouldn't happen... */
	if (m == NULL) {
		IPCOMPSTAT_INC(ipcomps_crypto);
		DPRINTF(("%s: bogus return buffer from crypto\n", __func__));
		error = EINVAL;
		goto bad;
	}
	IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);

	if (crp->crp_ilen - skip > crp->crp_olen) {
		struct mbuf *mo;
		struct ipcomp *ipcomp;
		int roff;
		uint8_t prot;

		/* Compression helped, inject IPCOMP header. */
		mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff);
		if (mo == NULL) {
			IPCOMPSTAT_INC(ipcomps_wrap);
			DPRINTF(("%s: IPCOMP header inject failed "
			    "for IPCA %s/%08lx\n",
			    __func__, ipsec_address(&sav->sah->saidx.dst, buf,
			    sizeof(buf)), (u_long) ntohl(sav->spi)));
			error = ENOBUFS;
			goto bad;
		}
		ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);

		/* Initialize the IPCOMP header */
		/* XXX alignment always correct? */
		switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
		case AF_INET:
			ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt;
			break;
#endif
		}
		ipcomp->comp_flags = 0;
		ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi));

		/* Fix Next Protocol in IPv4/IPv6 header */
		prot = IPPROTO_IPCOMP;
		m_copyback(m, protoff, sizeof(u_int8_t),
		    (u_char *)&prot);

		/* Adjust the length in the IP header */
		switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
		case AF_INET:
			mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			/*
			 * Subtract the fixed header size in host byte order
			 * and convert afterwards; subtracting from the
			 * already swapped value yields a wrong length on
			 * little-endian machines.
			 */
			mtod(m, struct ip6_hdr *)->ip6_plen =
			    htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
			break;
#endif /* INET6 */
		default:
			IPCOMPSTAT_INC(ipcomps_nopf);
			DPRINTF(("%s: unknown/unsupported protocol "
			    "family %d, IPCA %s/%08lx\n", __func__,
			    sav->sah->saidx.dst.sa.sa_family,
			    ipsec_address(&sav->sah->saidx.dst, buf,
			    sizeof(buf)), (u_long) ntohl(sav->spi)));
			error = EPFNOSUPPORT;
			goto bad;
		}
	} else {
		/* Compression was useless, we have lost time. */
		IPCOMPSTAT_INC(ipcomps_uncompr);
		DPRINTF(("%s: compressions was useless %d - %d <= %d\n",
		    __func__, crp->crp_ilen, skip, crp->crp_olen));
		/* XXX remember state to not compress the next couple
		 *     of packets, RFC 3173, 2.2. Non-Expansion Policy */
	}

	/* Release the crypto descriptor */
	free(xd, M_XDATA);
	crypto_freereq(crp);

	/* NB: m is reclaimed by ipsec_process_done. */
	error = ipsec_process_done(m, sp, sav, idx);
	CURVNET_RESTORE();
	return (error);
bad:
	if (m)
		m_freem(m);
	CURVNET_RESTORE();
	free(xd, M_XDATA);
	crypto_freereq(crp);
	key_freesav(&sav);
	key_freesp(&sp);
	return (error);
}

#ifdef INET
-static const struct encaptab *ipe4_cookie = NULL;
-extern struct domain inetdomain;
-static struct protosw ipcomp4_protosw = {
-	.pr_type =	SOCK_RAW,
-	.pr_domain =	&inetdomain,
-	.pr_protocol =	0 /* IPPROTO_IPV[46] */,
-	.pr_flags =	PR_ATOMIC | PR_ADDR | PR_LASTHDR,
-	.pr_input =	ipcomp_nonexp_input,
-	.pr_output =	rip_output,
-	.pr_ctloutput =	rip_ctloutput,
-	.pr_usrreqs =	&rip_usrreqs
-};
-
/*
 * Encap check for IPv4 outer headers.  Returns 0 when IPComp is disabled
 * or the inner protocol is neither IPPROTO_IPV4 nor IPPROTO_IPV6;
 * otherwise builds src/dst sockaddrs from the IPv4 header at the start of
 * the mbuf and returns the result of ipcomp_encapcheck().
 */
static int
ipcomp4_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
    void *arg __unused)
{
	union sockaddr_union src, dst;
	const struct ip *ip;

	if (V_ipcomp_enable == 0)
		return (0);
	if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
		return (0);
	bzero(&src, sizeof(src));
	bzero(&dst, sizeof(dst));
	src.sa.sa_family = dst.sa.sa_family = AF_INET;
	src.sin.sin_len = dst.sin.sin_len = sizeof(struct sockaddr_in);
	ip = mtod(m, const struct ip *);
	src.sin.sin_addr = ip->ip_src;
	dst.sin.sin_addr = ip->ip_dst;
	return (ipcomp_encapcheck(&src, &dst));
}
+
+static const struct encaptab *ipe4_cookie = NULL;
+static const struct encap_config ipv4_encap_cfg = {
+	.proto = -1,
+	.min_length = sizeof(struct ip),
+	.exact_match = sizeof(in_addr_t) << 4,
+	.check = ipcomp4_nonexp_encapcheck,
+	.input = ipcomp_nonexp_input
+};
#endif
#ifdef INET6
-static const struct encaptab *ipe6_cookie = NULL;
-extern struct domain inet6domain;
-static struct protosw ipcomp6_protosw = {
-	.pr_type =	SOCK_RAW,
-	.pr_domain =	&inet6domain,
-	.pr_protocol =	0 /* IPPROTO_IPV[46] */,
-	.pr_flags =	PR_ATOMIC | PR_ADDR | PR_LASTHDR,
-	.pr_input =	ipcomp_nonexp_input,
-	.pr_output =	rip6_output,
-	.pr_ctloutput =	rip6_ctloutput,
-	.pr_usrreqs =	&rip6_usrreqs
-};
-
/*
 * Encap check for IPv6 outer headers.  Returns 0 when IPComp is disabled
 * or the inner protocol is neither IPPROTO_IPV4 nor IPPROTO_IPV6;
 * otherwise builds src/dst sockaddrs from the IPv6 header at the start of
 * the mbuf (moving an embedded link-local scope into sin6_scope_id) and
 * returns the result of ipcomp_encapcheck().
 */
static int
ipcomp6_nonexp_encapcheck(const struct mbuf *m, int off, int proto,
    void *arg __unused)
{
	union sockaddr_union src, dst;
	const struct ip6_hdr *ip6;

	if (V_ipcomp_enable == 0)
		return (0);
	if (proto != IPPROTO_IPV4 && proto != IPPROTO_IPV6)
		return (0);
	bzero(&src, sizeof(src));
	bzero(&dst, sizeof(dst));
	/*
	 * The addresses below are stored as sockaddr_in6, so the family
	 * must be AF_INET6 to agree with sin6_len and sin6_addr.
	 */
	src.sa.sa_family = dst.sa.sa_family = AF_INET6;
	src.sin6.sin6_len = dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
	ip6 = mtod(m, const struct ip6_hdr *);
	src.sin6.sin6_addr = ip6->ip6_src;
	dst.sin6.sin6_addr = ip6->ip6_dst;
	if (IN6_IS_SCOPE_LINKLOCAL(&src.sin6.sin6_addr)) {
		/* XXX: sa6_recoverscope() */
		src.sin6.sin6_scope_id =
		    ntohs(src.sin6.sin6_addr.s6_addr16[1]);
		src.sin6.sin6_addr.s6_addr16[1] = 0;
	}
	if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6.sin6_addr)) {
		/* XXX: sa6_recoverscope() */
		dst.sin6.sin6_scope_id =
		    ntohs(dst.sin6.sin6_addr.s6_addr16[1]);
		dst.sin6.sin6_addr.s6_addr16[1] = 0;
	}
	return (ipcomp_encapcheck(&src, &dst));
}
+
+static const struct encaptab *ipe6_cookie = NULL;
+static const struct encap_config ipv6_encap_cfg = {
+	.proto = -1,
+	.min_length = sizeof(struct ip6_hdr),
+	.exact_match = sizeof(struct in6_addr) << 4,
+	.check = ipcomp6_nonexp_encapcheck,
+	.input = ipcomp_nonexp_input
+};
#endif

/* Transform switch entry describing the IPComp handlers. */
static struct xformsw ipcomp_xformsw = {
	.xf_type =	XF_IPCOMP,
	.xf_name =	"IPcomp",
	.xf_init =	ipcomp_init,
	.xf_zeroize =	ipcomp_zeroize,
	.xf_input =	ipcomp_input,
	.xf_output =	ipcomp_output,
};

/*
 * Attach the per-family encap handlers (saving their cookies for detach)
 * and register the IPComp transform.
 */
static void
ipcomp_attach(void)
{

#ifdef INET
-	ipe4_cookie = encap_attach_func(AF_INET, -1,
-	    ipcomp4_nonexp_encapcheck, &ipcomp4_protosw, NULL);
+	ipe4_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);
#endif
#ifdef INET6
-	ipe6_cookie = encap_attach_func(AF_INET6, -1,
-	    ipcomp6_nonexp_encapcheck, &ipcomp6_protosw, NULL);
+	ipe6_cookie = ip6_encap_attach(&ipv6_encap_cfg, NULL, M_WAITOK);
#endif
	xform_attach(&ipcomp_xformsw);
}

/*
 * Undo ipcomp_attach(): detach the encap handlers via their cookies and
 * unregister the IPComp transform.
 */
static void
ipcomp_detach(void)
{

#ifdef INET
-	encap_detach(ipe4_cookie);
+	ip_encap_detach(ipe4_cookie);
#endif
#ifdef INET6
-	encap_detach(ipe6_cookie);
+	ip6_encap_detach(ipe6_cookie);
#endif
	xform_detach(&ipcomp_xformsw);
}
/*
 * Register ipcomp_attach() as the SYSINIT handler and ipcomp_detach() as
 * the matching SYSUNINIT handler, both at SI_SUB_PROTO_DOMAIN with
 * SI_ORDER_MIDDLE ordering; NULL is passed as the handler argument.
 */
SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); SYSUNINIT(ipcomp_xform_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_detach, NULL);