diff --git a/sys/net/pfil.c b/sys/net/pfil.c index 35ac338095b6..78b87f3179c5 100644 --- a/sys/net/pfil.c +++ b/sys/net/pfil.c @@ -1,385 +1,307 @@ /* $FreeBSD$ */ /* $NetBSD: pfil.c,v 1.20 2001/11/12 23:49:46 lukem Exp $ */ /*- * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include +#include #include +#include #include #include #include #include #include #include #include #include #include #include static struct mtx pfil_global_lock; MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", MTX_DEF); static int pfil_list_add(pfil_list_t *, struct packet_filter_hook *, int); static int pfil_list_remove(pfil_list_t *, int (*)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *); LIST_HEAD(, pfil_head) pfil_head_list = LIST_HEAD_INITIALIZER(&pfil_head_list); -static __inline void -PFIL_RLOCK(struct pfil_head *ph) -{ - mtx_lock(&ph->ph_mtx); - ph->ph_busy_count++; - mtx_unlock(&ph->ph_mtx); -} - -static __inline void -PFIL_RUNLOCK(struct pfil_head *ph) -{ - mtx_lock(&ph->ph_mtx); - ph->ph_busy_count--; - if (ph->ph_busy_count == 0 && ph->ph_want_write) - cv_signal(&ph->ph_cv); - mtx_unlock(&ph->ph_mtx); -} - -static __inline void -PFIL_WLOCK(struct pfil_head *ph) -{ - mtx_lock(&ph->ph_mtx); - ph->ph_want_write = 1; - while (ph->ph_busy_count > 0) - cv_wait(&ph->ph_cv, &ph->ph_mtx); -} - -static __inline int -PFIL_TRY_WLOCK(struct pfil_head *ph) -{ - mtx_lock(&ph->ph_mtx); - ph->ph_want_write = 1; - if (ph->ph_busy_count > 0) { - ph->ph_want_write = 0; - mtx_unlock(&ph->ph_mtx); - return EBUSY; - } - return 0; -} - -static __inline void -PFIL_WUNLOCK(struct pfil_head *ph) -{ - ph->ph_want_write = 0; - cv_signal(&ph->ph_cv); - mtx_unlock(&ph->ph_mtx); -} - -#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) -#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) - /* * pfil_run_hooks() runs the specified packet filter hooks. */ int pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, int dir, struct inpcb *inp) { struct packet_filter_hook *pfh; struct mbuf *m = *mp; int rv = 0; - if (ph->ph_busy_count == -1) - return (0); - /* - * Prevent packet filtering from starving the modification of - * the packet filters. We would prefer a reader/writer locking - * mechanism with guaranteed ordering, though. - */ - if (ph->ph_want_write) { - m_freem(*mp); - *mp = NULL; - return (ENOBUFS); - } - PFIL_RLOCK(ph); + KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); for (pfh = pfil_hook_get(dir, ph); pfh != NULL; pfh = TAILQ_NEXT(pfh, pfil_link)) { if (pfh->pfil_func != NULL) { rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, inp); if (rv != 0 || m == NULL) break; } } PFIL_RUNLOCK(ph); *mp = m; return (rv); } /* * pfil_head_register() registers a pfil_head with the packet filter * hook mechanism. */ int pfil_head_register(struct pfil_head *ph) { struct pfil_head *lph; PFIL_LIST_LOCK(); LIST_FOREACH(lph, &pfil_head_list, ph_list) if (ph->ph_type == lph->ph_type && ph->ph_un.phu_val == lph->ph_un.phu_val) { PFIL_LIST_UNLOCK(); return EEXIST; } PFIL_LIST_UNLOCK(); - if (mtx_initialized(&ph->ph_mtx)) { /* should not happen */ - KASSERT((0), ("%s: allready initialized!", __func__)); - return EBUSY; - } else { - ph->ph_busy_count = -1; - ph->ph_want_write = 1; - mtx_init(&ph->ph_mtx, "pfil_head_mtx", NULL, MTX_DEF); - cv_init(&ph->ph_cv, "pfil_head_cv"); - mtx_lock(&ph->ph_mtx); /* XXX: race? */ - } + rw_init(&ph->ph_mtx, "PFil hook read/write mutex"); + PFIL_WLOCK(ph); + ph->ph_nhooks = 0; TAILQ_INIT(&ph->ph_in); TAILQ_INIT(&ph->ph_out); PFIL_LIST_LOCK(); LIST_INSERT_HEAD(&pfil_head_list, ph, ph_list); PFIL_LIST_UNLOCK(); - + PFIL_WUNLOCK(ph); - + return (0); } /* * pfil_head_unregister() removes a pfil_head from the packet filter * hook mechanism. */ int pfil_head_unregister(struct pfil_head *ph) { struct packet_filter_hook *pfh, *pfnext; PFIL_LIST_LOCK(); /* * LIST_REMOVE is safe for unlocked pfil_heads in ph_list. * No need to WLOCK all of them. */ LIST_REMOVE(ph, ph_list); PFIL_LIST_UNLOCK(); - PFIL_WLOCK(ph); /* XXX: may sleep (cv_wait)! */ + PFIL_WLOCK(ph); TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_link, pfnext) free(pfh, M_IFADDR); TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_link, pfnext) free(pfh, M_IFADDR); - cv_destroy(&ph->ph_cv); - mtx_destroy(&ph->ph_mtx); + rw_destroy(&ph->ph_mtx); return (0); } /* * pfil_head_get() returns the pfil_head for a given key/dlt. */ struct pfil_head * pfil_head_get(int type, u_long val) { struct pfil_head *ph; PFIL_LIST_LOCK(); LIST_FOREACH(ph, &pfil_head_list, ph_list) if (ph->ph_type == type && ph->ph_un.phu_val == val) break; PFIL_LIST_UNLOCK(); return (ph); } /* * pfil_add_hook() adds a function to the packet filter hook. the * flags are: * PFIL_IN call me on incoming packets * PFIL_OUT call me on outgoing packets * PFIL_ALL call me on all of the above * PFIL_WAITOK OK to call malloc with M_WAITOK. */ int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *arg, int flags, struct pfil_head *ph) { struct packet_filter_hook *pfh1 = NULL; struct packet_filter_hook *pfh2 = NULL; int err; /* Get memory */ if (flags & PFIL_IN) { pfh1 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); if (pfh1 == NULL) { err = ENOMEM; goto error; } } if (flags & PFIL_OUT) { pfh2 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); if (pfh2 == NULL) { err = ENOMEM; goto error; } } /* Lock */ - if (flags & PFIL_WAITOK) - PFIL_WLOCK(ph); - else { - err = PFIL_TRY_WLOCK(ph); - if (err) - goto error; - } + PFIL_WLOCK(ph); /* Add */ if (flags & PFIL_IN) { pfh1->pfil_func = func; pfh1->pfil_arg = arg; err = pfil_list_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); if (err) goto done; + ph->ph_nhooks++; } if (flags & PFIL_OUT) { pfh2->pfil_func = func; pfh2->pfil_arg = arg; err = pfil_list_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); if (err) { if (flags & PFIL_IN) pfil_list_remove(&ph->ph_in, func, arg); goto done; } + ph->ph_nhooks++; } - ph->ph_busy_count = 0; PFIL_WUNLOCK(ph); return 0; done: PFIL_WUNLOCK(ph); error: if (pfh1 != NULL) free(pfh1, M_IFADDR); if (pfh2 != NULL) free(pfh2, M_IFADDR); return err; } /* * pfil_remove_hook removes a specific function from the packet filter * hook list. */ int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *arg, int flags, struct pfil_head *ph) { int err = 0; - if (flags & PFIL_WAITOK) - PFIL_WLOCK(ph); - else { - err = PFIL_TRY_WLOCK(ph); - if (err) - return err; - } + PFIL_WLOCK(ph); - if (flags & PFIL_IN) + if (flags & PFIL_IN) { err = pfil_list_remove(&ph->ph_in, func, arg); - if ((err == 0) && (flags & PFIL_OUT)) + if (err == 0) + ph->ph_nhooks--; + } + if ((err == 0) && (flags & PFIL_OUT)) { err = pfil_list_remove(&ph->ph_out, func, arg); - - if (TAILQ_EMPTY(&ph->ph_in) && TAILQ_EMPTY(&ph->ph_out)) - ph->ph_busy_count = -1; - + if (err == 0) + ph->ph_nhooks--; + } PFIL_WUNLOCK(ph); return err; } static int pfil_list_add(pfil_list_t *list, struct packet_filter_hook *pfh1, int flags) { struct packet_filter_hook *pfh; /* * First make sure the hook is not already there. */ TAILQ_FOREACH(pfh, list, pfil_link) if (pfh->pfil_func == pfh1->pfil_func && pfh->pfil_arg == pfh1->pfil_arg) return EEXIST; /* * insert the input list in reverse order of the output list * so that the same path is followed in or out of the kernel. */ if (flags & PFIL_IN) TAILQ_INSERT_HEAD(list, pfh1, pfil_link); else TAILQ_INSERT_TAIL(list, pfh1, pfil_link); return 0; } /* * pfil_list_remove is an internal function that takes a function off the * specified list. */ static int pfil_list_remove(pfil_list_t *list, int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *arg) { struct packet_filter_hook *pfh; TAILQ_FOREACH(pfh, list, pfil_link) if (pfh->pfil_func == func && pfh->pfil_arg == arg) { TAILQ_REMOVE(list, pfh, pfil_link); free(pfh, M_IFADDR); return 0; } return ENOENT; } diff --git a/sys/net/pfil.h b/sys/net/pfil.h index da14f5b4801c..ccf7a651717a 100644 --- a/sys/net/pfil.h +++ b/sys/net/pfil.h @@ -1,113 +1,113 @@ /* $FreeBSD$ */ /* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ /*- * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_PFIL_H_ #define _NET_PFIL_H_ #include #include #include #include -#include /* XXX */ +#include struct mbuf; struct ifnet; struct inpcb; /* * The packet filter hooks are designed for anything to call them to * possibly intercept the packet. */ struct packet_filter_hook { TAILQ_ENTRY(packet_filter_hook) pfil_link; int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *); void *pfil_arg; int pfil_flags; }; #define PFIL_IN 0x00000001 #define PFIL_OUT 0x00000002 #define PFIL_WAITOK 0x00000004 #define PFIL_ALL (PFIL_IN|PFIL_OUT) typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; #define PFIL_TYPE_AF 1 /* key is AF_* type */ #define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ struct pfil_head { pfil_list_t ph_in; pfil_list_t ph_out; int ph_type; - /* - * Locking: use a busycounter per pfil_head. - * Use ph_busy_count = -1 to indicate pfil_head is empty. - */ - int ph_busy_count; /* count of threads with read lock */ - int ph_want_write; /* want write lock flag */ - struct cv ph_cv; /* for waking up writers */ - struct mtx ph_mtx; /* mutex on locking state */ + int ph_nhooks; + struct rwlock ph_mtx; union { u_long phu_val; void *phu_ptr; } ph_un; #define ph_af ph_un.phu_val #define ph_ifnet ph_un.phu_ptr LIST_ENTRY(pfil_head) ph_list; }; int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_head_register(struct pfil_head *); int pfil_head_unregister(struct pfil_head *); struct pfil_head *pfil_head_get(int, u_long); +#define PFIL_HOOKED(p) (&(p)->ph_nhooks > 0) +#define PFIL_RLOCK(p) rw_rlock(&(p)->ph_mtx) +#define PFIL_WLOCK(p) rw_wlock(&(p)->ph_mtx) +#define PFIL_RUNLOCK(p) rw_runlock(&(p)->ph_mtx) +#define PFIL_WUNLOCK(p) rw_wunlock(&(p)->ph_mtx) +#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) + static __inline struct packet_filter_hook * pfil_hook_get(int dir, struct pfil_head *ph) { - KASSERT(ph->ph_busy_count > 0, - ("pfil_hook_get: called on unbusy pfil_head")); if (dir == PFIL_IN) return (TAILQ_FIRST(&ph->ph_in)); else if (dir == PFIL_OUT) return (TAILQ_FIRST(&ph->ph_out)); else return (NULL); } #endif /* _NET_PFIL_H_ */ diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c index 6ad8e9d85fe6..94da84b2645e 100644 --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -1,607 +1,607 @@ /*- * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * ip_fastforward gets its speed from processing the forwarded packet to * completion (if_output on the other side) without any queues or netisr's. * The receiving interface DMAs the packet into memory, the upper half of * driver calls ip_fastforward, we do our routing table lookup and directly * send it off to the outgoing interface, which DMAs the packet to the * network card. The only part of the packet we touch with the CPU is the * IP header (unless there are complex firewall rules touching other parts * of the packet, but that is up to you). We are essentially limited by bus * bandwidth and how fast the network card/driver can set up receives and * transmits. * * We handle basic errors, IP header errors, checksum errors, * destination unreachable, fragmentation and fragmentation needed and * report them via ICMP to the sender. * * Else if something is not pure IPv4 unicast forwarding we fall back to * the normal ip_input processing path. We should only be called from * interfaces connected to the outside world. * * Firewalling is fully supported including divert, ipfw fwd and ipfilter * ipnat and address rewrite. * * IPSEC is not supported if this host is a tunnel broker. IPSEC is * supported for connections to/from local host. * * We try to do the least expensive (in CPU ops) checks and operations * first to catch junk with as little overhead as possible. * * We take full advantage of hardware support for IP checksum and * fragmentation offloading. * * We don't do ICMP redirect in the fast forwarding path. I have had my own * cases where two core routers with Zebra routing suite would send millions * ICMP redirects to connected hosts if the destination router was not the * default gateway. In one case it was filling the routing table of a host * with approximately 300.000 cloned redirect entries until it ran out of * kernel memory. However the networking code proved very robust and it didn't * crash or fail in other ways. */ /* * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which * is being followed here. */ #include "opt_ipfw.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ipfastforward_active = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, &ipfastforward_active, 0, "Enable fast IP forwarding"); static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { struct sockaddr_in *dst; struct rtentry *rt; /* * Find route to destination. */ bzero(ro, sizeof(*ro)); dst = (struct sockaddr_in *)&ro->ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr.s_addr = dest.s_addr; rtalloc_ign(ro, RTF_CLONING); /* * Route there and interface still up? */ rt = ro->ro_rt; if (rt && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_flags & IFF_UP) && (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) { if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)rt->rt_gateway; } else { ipstat.ips_noroute++; ipstat.ips_cantforward++; if (rt) RTFREE(rt); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return NULL; } return dst; } /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. * If the packet is handled (and consumed) here then we return 1; * otherwise 0 is returned and the packet should be delivered * to ip_input for full processing. */ struct mbuf * ip_fastforward(struct mbuf *m) { struct ip *ip; struct mbuf *m0 = NULL; struct route ro; struct sockaddr_in *dst = NULL; struct ifnet *ifp; struct in_addr odest, dest; u_short sum, ip_len; int error = 0; int hlen, mtu; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif /* * Are we active and forwarding packets? */ if (!ipfastforward_active || !ipforwarding) return m; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ro.ro_rt = NULL; /* * Step 1: check for packet drop conditions (and sanity checks) */ /* * Is entire packet big enough? */ if (m->m_pkthdr.len < sizeof(struct ip)) { ipstat.ips_tooshort++; goto drop; } /* * Is first mbuf large enough for ip header and is header present? */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { ipstat.ips_toosmall++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); /* * Is it IPv4? */ if (ip->ip_v != IPVERSION) { ipstat.ips_badvers++; goto drop; } /* * Is IP header length correct and is it in first mbuf? */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ ipstat.ips_badlen++; goto drop; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { ipstat.ips_badhlen++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); } /* * Checksum correct? */ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); else { if (hlen == sizeof(struct ip)) sum = in_cksum_hdr(ip); else sum = in_cksum(m, hlen); } if (sum) { ipstat.ips_badsum++; goto drop; } /* * Remember that we have checked the IP header and found it valid. */ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); ip_len = ntohs(ip->ip_len); /* * Is IP length longer than packet we have got? */ if (m->m_pkthdr.len < ip_len) { ipstat.ips_tooshort++; goto drop; } /* * Is packet longer than IP header tells us? If yes, truncate packet. */ if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Is packet from or to 127/8? */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { ipstat.ips_badaddr++; goto drop; } #ifdef ALTQ /* * Is packet dropped by traffic conditioner? */ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) goto drop; #endif /* * Step 2: fallback conditions to normal ip_input path processing */ /* * Only IP packets without options */ if (ip->ip_hl != (sizeof(struct ip) >> 2)) { if (ip_doopts == 1) return m; else if (ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); return NULL; /* mbuf already free'd */ } /* else ignore IP options and continue */ } /* * Only unicast IP, not from loopback, no L2 or IP broadcast, * no multicast, no INADDR_ANY * * XXX: Probably some of these checks could be direct drop * conditions. However it is not clear whether there are some * hacks or obscure behaviours which make it neccessary to * let ip_input handle it. We play safe here and let ip_input * deal with it until it is proven that we can directly drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) || (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || ip->ip_src.s_addr == INADDR_ANY || ip->ip_dst.s_addr == INADDR_ANY ) return m; /* * Is it for a local address on this host? */ if (in_localip(ip->ip_dst)) return m; ipstat.ips_total++; /* * Step 3: incoming packet firewall processing */ /* * Convert to host representation */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; /* * Run through list of ipfilter hooks for input packets */ - if (inet_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet_pfil_hook)) goto passin; if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) || m == NULL) goto drop; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); /* m may have changed by pfil hook */ dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (odest.s_addr != dest.s_addr) { /* * Is it now for a local address on this host? */ if (in_localip(dest)) goto forwardlocal; /* * Go on with new destination address */ } #ifdef IPFIREWALL_FORWARD if (m->m_flags & M_FASTFWD_OURS) { /* * ipfw changed it for a local address on this host. */ goto forwardlocal; } #endif /* IPFIREWALL_FORWARD */ passin: /* * Step 4: decrement TTL and look up route */ /* * Check TTL */ #ifdef IPSTEALTH if (!ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return NULL; /* mbuf already free'd */ } /* * Decrement the TTL and incrementally change the IP header checksum. * Don't bother doing this with hw checksum offloading, it's faster * doing it right here. */ ip->ip_ttl -= IPTTLDEC; if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) ip->ip_sum -= ~htons(IPTTLDEC << 8); else ip->ip_sum += htons(IPTTLDEC << 8); #ifdef IPSTEALTH } #endif /* * Find route to destination. */ if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; /* * Immediately drop blackholed traffic. */ if (ro.ro_rt->rt_flags & RTF_BLACKHOLE) goto drop; /* * Step 5: outgoing firewall packet processing */ /* * Run through list of hooks for output packets. */ - if (inet_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) { goto drop; } M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ #ifndef IPFIREWALL_FORWARD if (odest.s_addr != dest.s_addr) { #else fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { #endif /* IPFIREWALL_FORWARD */ /* * Is it now for a local address on this host? */ #ifndef IPFIREWALL_FORWARD if (in_localip(dest)) { #else if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { #endif /* IPFIREWALL_FORWARD */ forwardlocal: /* * Return packet for processing by ip_input(). * Keep host byte order as expected at ip_input's * "ours"-label. */ m->m_flags |= M_FASTFWD_OURS; if (ro.ro_rt) RTFREE(ro.ro_rt); return m; } /* * Redo route lookup with new destination address */ #ifdef IPFIREWALL_FORWARD if (fwd_tag) { if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) dest.s_addr = ((struct sockaddr_in *)(fwd_tag+1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); } #endif /* IPFIREWALL_FORWARD */ RTFREE(ro.ro_rt); if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; } passout: /* * Step 6: send off the packet */ /* * Check if route is dampned (when ARP is unable to resolve) */ if ((ro.ro_rt->rt_flags & RTF_REJECT) && ro.ro_rt->rt_rmx.rmx_expire >= time_uptime) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } #ifndef ALTQ /* * Check if there is enough space in the interface queue */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { ipstat.ips_odropped++; /* would send source quench here but that is depreciated */ goto drop; } #endif /* * Check if media link state of interface is not down */ if (ifp->if_link_state == LINK_STATE_DOWN) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } /* * Check if packet fits MTU or if hardware will fragment for us */ if (ro.ro_rt->rt_rmx.rmx_mtu) mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); else mtu = ifp->if_mtu; if (ip->ip_len <= mtu || (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) { /* * Restore packet header fields to original values */ ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* * Send off the packet via outgoing interface */ error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro.ro_rt); } else { /* * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip->ip_off & IP_DF) { ipstat.ips_cantfrag++; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); goto consumed; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; /* * ip_fragment expects ip_len and ip_off in host byte * order but returns all packets in network byte order */ if (ip_fragment(ip, &m, mtu, ifp->if_hwassist, (~ifp->if_hwassist & CSUM_DELAY_IP))) { goto drop; } KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface */ error = 0; do { m0 = m->m_nextpkt; m->m_nextpkt = NULL; error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro.ro_rt); if (error) break; } while ((m = m0) != NULL); if (error) { /* Reclaim remaining fragments */ for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m_freem(m); } } else ipstat.ips_fragmented++; } } if (error != 0) ipstat.ips_odropped++; else { ro.ro_rt->rt_rmx.rmx_pksent++; ipstat.ips_forward++; ipstat.ips_fastforward++; } consumed: RTFREE(ro.ro_rt); return NULL; drop: if (m) m_freem(m); if (ro.ro_rt) RTFREE(ro.ro_rt); return NULL; } diff --git a/sys/netinet/ip_fw2.c b/sys/netinet/ip_fw2.c index e28695198c73..ea43ece3e42d 100644 --- a/sys/netinet/ip_fw2.c +++ b/sys/netinet/ip_fw2.c @@ -1,4262 +1,4227 @@ /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #define DEB(x) #define DDB(x) x /* * Implement IP packet firewall (new version) */ #if !defined(KLD_MODULE) #include "opt_ipfw.h" #include "opt_ip6fw.h" #include "opt_ipdn.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #endif #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif #include #include #ifdef INET6 #include #endif #include /* XXX for ETHERTYPE_IP */ #include /* XXX for in_cksum */ /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. */ static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 /* * Data structure to cache our ucred related * information. This structure only gets used if * the user specified UID/GID based constraints in * a firewall rule. */ struct ip_fw_ugid { gid_t fw_groups[NGROUPS]; int fw_ngroups; uid_t fw_uid; int fw_prid; }; #define IPFW_TABLES_MAX 128 struct ip_fw_chain { struct ip_fw *rules; /* list of rules */ struct ip_fw *reap; /* list of rules to reap */ struct radix_node_head *tables[IPFW_TABLES_MAX]; - struct mtx mtx; /* lock guarding rule list */ - int busy_count; /* busy count for rw locks */ - int want_write; - struct cv cv; + struct rwlock rwmtx; }; #define IPFW_LOCK_INIT(_chain) \ - mtx_init(&(_chain)->mtx, "IPFW static rules", NULL, \ - MTX_DEF | MTX_RECURSE) -#define IPFW_LOCK_DESTROY(_chain) mtx_destroy(&(_chain)->mtx) + rw_init(&(_chain)->rwmtx, "IPFW static rules") +#define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx) #define IPFW_WLOCK_ASSERT(_chain) do { \ - mtx_assert(&(_chain)->mtx, MA_OWNED); \ + rw_assert(rw, RA_WLOCKED); \ NET_ASSERT_GIANT(); \ } while (0) -static __inline void -IPFW_RLOCK(struct ip_fw_chain *chain) -{ - mtx_lock(&chain->mtx); - chain->busy_count++; - mtx_unlock(&chain->mtx); -} - -static __inline void -IPFW_RUNLOCK(struct ip_fw_chain *chain) -{ - mtx_lock(&chain->mtx); - chain->busy_count--; - if (chain->busy_count == 0 && chain->want_write) - cv_signal(&chain->cv); - mtx_unlock(&chain->mtx); -} - -static __inline void -IPFW_WLOCK(struct ip_fw_chain *chain) -{ - mtx_lock(&chain->mtx); - chain->want_write++; - while (chain->busy_count > 0) - cv_wait(&chain->cv, &chain->mtx); -} - -static __inline void -IPFW_WUNLOCK(struct ip_fw_chain *chain) -{ - chain->want_write--; - cv_signal(&chain->cv); - mtx_unlock(&chain->mtx); -} +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) /* * list of rules for layer 3 */ static struct ip_fw_chain layer3_chain; MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; static int fw_debug = 1; static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0, "Enable ipfw"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &autoinc_step, 0, "Rule number autincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &fw_verbose, 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ static ipfw_dyn_rule **ipfw_dyn_v = NULL; static u_int32_t dyn_buckets = 256; /* must be power of 2 */ static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) /* * Timeouts for various events in handing dynamic rules. */ static u_int32_t dyn_ack_lifetime = 300; static u_int32_t dyn_syn_lifetime = 20; static u_int32_t dyn_fin_lifetime = 1; static u_int32_t dyn_rst_lifetime = 1; static u_int32_t dyn_udp_lifetime = 10; static u_int32_t dyn_short_lifetime = 5; /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static u_int32_t dyn_keepalive_interval = 20; static u_int32_t dyn_keepalive_period = 5; static u_int32_t dyn_keepalive = 1; /* do send keepalives */ static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &dyn_buckets, 0, "Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &dyn_count, 0, "Number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &dyn_max, 0, "Max number of dyn. rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &static_count, 0, "Number of static rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); #ifdef INET6 /* * IPv6 specific variables */ SYSCTL_DECL(_net_inet6_ip6); static struct sysctl_ctx_list ip6_fw_sysctl_ctx; static struct sysctl_oid *ip6_fw_sysctl_tree; #endif /* INET6 */ #endif /* SYSCTL_NODE */ static int fw_deny_unknown_exthdrs = 1; /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { struct ifaddr *ia; /* XXX lock? */ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return(1); /* match */ } } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. The 'versrcreach' * option just checks that the source address is reachable via any route * (except default) in the routing table. These two are a measure to block * forged packets. This is also commonly known as "anti-spoofing" or Unicast * Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that syntax is * misleading. The check may be performed on all IP packets whether unicast, * multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp) { struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &ifnet, if_link) for (mdc2 = mdc->if_addrlist.tqh_first; mdc2; mdc2 = mdc2->ifa_list.tqe_next) { if (!mdc2->ifa_addr) continue; if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) return 1; } } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; rtalloc_ign((struct route *)&ro, RTF_CLONING); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. * (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[0]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[1]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port) ^ (id->flow_id6); return i; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_short offset, u_int hlen) { if (code == ICMP6_UNREACH_RST && offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct ip6_hdr *ip6; struct tcphdr *tcp; tcp_seq ack, seq; int flags; struct { struct ip6_hdr ip6; struct tcphdr th; } ti; if (args->m->m_len < (hlen+sizeof(struct tcphdr))) { args->m = m_pullup(args->m, hlen+sizeof(struct tcphdr)); if (args->m == NULL) return; } ip6 = mtod(args->m, struct ip6_hdr *); tcp = (struct tcphdr *)(mtod(args->m, char *) + hlen); if ((tcp->th_flags & TH_RST) != 0) { m_freem(args->m); return; } ti.ip6 = *ip6; ti.th = *tcp; ti.th.th_seq = ntohl(ti.th.th_seq); ti.th.th_ack = ntohl(ti.th.th_ack); ti.ip6.ip6_nxt = IPPROTO_TCP; if (ti.th.th_flags & TH_ACK) { ack = 0; seq = ti.th.th_ack; flags = TH_RST; } else { ack = ti.th.th_seq; if (((args->m)->m_flags & M_PKTHDR) != 0) { ack += (args->m)->m_pkthdr.len - hlen - (ti.th.th_off << 2); } else if (ip6->ip6_plen) { ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) - hlen - (ti.th.th_off << 2); } else { m_freem(args->m); return; } if (tcp->th_flags & TH_SYN) ack++; seq = 0; flags = TH_RST|TH_ACK; } bcopy(&ti, ip6, sizeof(ti)); tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), args->m, ack, seq, flags); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ icmp6_error(args->m, ICMP6_DST_UNREACH, code, 0); } else m_freem(args->m); args->m = NULL; } #endif /* INET6 */ static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ static void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset) { struct ether_header *eh = args->eh; char *action; int limit_reached = 0; char action2[40], proto[128], fragment[32]; fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (verbose_limit != 0 && norule_counter >= verbose_limit) return; norule_counter++; if (norule_counter == verbose_limit) limit_reached = verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(sa->sa.sin_addr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; char src[48], dst[48]; struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; /* Initialize to make compiler happy. */ struct ip *ip = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 if (args->f_id.addr_type == 6) { snprintf(src, sizeof(src), "[%s]", ip6_sprintf(&args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(&args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)mtod(m, struct ip6_hdr *); tcp = (struct tcphdr *)(mtod(args->m, char *) + hlen); udp = (struct udphdr *)(mtod(args->m, char *) + hlen); } else #endif { ip = mtod(m, struct ip *); tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntoa_r(ip->ip_src, src); inet_ntoa_r(ip->ip_dst, dst); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(mtod(args->m, char *) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (args->f_id.addr_type == 6) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.frag_id6, ntohs(ip6->ip6_plen) - hlen, ntohs(offset & IP6F_OFF_MASK) << 3, (offset & IP6F_MORE_FRAG) ? "+" : ""); } else #endif { int ip_off, ip_len; if (eh != NULL) { /* layer 2 packets are as on the wire */ ip_off = ntohs(ip->ip_off); ip_len = ntohs(ip->ip_len); } else { ip_off = ip->ip_off; ip_len = ip->ip_len; } if (ip_off & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), offset << 3, (ip_off & IP_MF) ? "+" : ""); } } if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (curr_dyn_buckets - 1); return i; } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. */ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) return; last_remove = time_uptime; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < curr_dyn_buckets ; i++) { for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } /** * lookup a dynamic rule. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * stateful ipfw extensions. * Lookup into dynamic session queue */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q=NULL; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && q->dyn_type != O_LIMIT_PARENT) { if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6)) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.dst_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.src_ip6)) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port ) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port ) { dir = MATCH_REVERSE; break; } } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ q->expire = time_uptime + dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN : /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8) : if (tcp) { #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) u_int32_t ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) q->ack_fwd = ack; else { /* ignore out-of-sequence */ break; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) q->ack_rev = ack; else { /* ignore out-of-sequence */ break; } } } q->expire = time_uptime + dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (dyn_fin_lifetime >= dyn_keepalive_period) dyn_fin_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (dyn_rst_lifetime >= dyn_keepalive_period) dyn_rst_lifetime = dyn_keepalive_period - 1; q->expire = time_uptime + dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + dyn_short_lifetime; } done: if (match_direction) *match_direction = dir; return q; } static ipfw_dyn_rule * lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. In case of overflow, * default to 1024. */ if (dyn_buckets > 65536) dyn_buckets = 1024; if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ dyn_buckets = curr_dyn_buckets; /* reset */ return; } curr_dyn_buckets = dyn_buckets; if (ipfw_dyn_v != NULL) free(ipfw_dyn_v, M_IPFW); for (;;) { ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) break; curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v == NULL || (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { realloc_dynamic_table(); if (ipfw_dyn_v == NULL) return NULL; /* failed ! */ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = ipfw_dyn_v[i]; ipfw_dyn_v[i] = r; dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), dyn_count ); ) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ static int install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args) { static int last_log; ipfw_dyn_rule *q; DEB(printf("ipfw: install state type %d 0x%08x %u -> 0x%08x %u\n", cmd->o.opcode, (args->f_id.src_ip), (args->f_id.src_port), (args->f_id.dst_ip), (args->f_id.dst_port) );) IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: install_state: entry already present, done\n"); } IPFW_DYN_UNLOCK(); return 0; } if (dyn_count >= dyn_max) /* * Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (dyn_count >= dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: install_state: Too many dynamic rules\n"); } IPFW_DYN_UNLOCK(); return 1; /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: /* limit number of sessions */ { u_int16_t limit_mask = cmd->limit_mask; struct ipfw_flow_id id; ipfw_dyn_rule *parent; DEB(printf("ipfw: installing dyn-limit rule %d\n", cmd->conn_limit);) id.dst_ip = id.src_ip = 0; id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; parent = lookup_dyn_parent(&id, rule); if (parent == NULL) { printf("ipfw: add parent failed\n"); IPFW_DYN_UNLOCK(); return 1; } if (parent->count >= cmd->conn_limit) { /* * See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= cmd->conn_limit) { if (fw_verbose && last_log != time_uptime) { last_log = time_uptime; log(LOG_SECURITY | LOG_DEBUG, "drop session, too many entries\n"); } IPFW_DYN_UNLOCK(); return 1; } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); } break; default: printf("ipfw: unknown dynamic rule type %u\n", cmd->o.opcode); IPFW_DYN_UNLOCK(); return 1; } lookup_dyn_rule_locked(&args->f_id, NULL, NULL); /* XXX just set lifetime */ IPFW_DYN_UNLOCK(); return 0; } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ */ static struct mbuf * send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m; struct ip *ip; struct tcphdr *tcp; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) return (NULL); m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr); m->m_data += max_linkhdr; ip = mtod(m, struct ip *); bzero(ip, m->m_len); tcp = (struct tcphdr *)(ip + 1); /* no IP options */ ip->ip_p = IPPROTO_TCP; tcp->th_off = 5; /* * Assume we are sending a RST (or a keepalive in the reverse * direction), swap src and destination addresses and ports. */ ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); tcp->th_sport = htons(id->dst_port); tcp->th_dport = htons(id->src_port); if (flags & TH_RST) { /* we are sending a RST */ if (flags & TH_ACK) { tcp->th_seq = htonl(ack); tcp->th_ack = htonl(0); tcp->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; tcp->th_seq = htonl(0); tcp->th_ack = htonl(seq); tcp->th_flags = TH_RST | TH_ACK; } } else { /* * We are sending a keepalive. flags & TH_SYN determines * the direction, forward if set, reverse if clear. * NOTE: seq and ack are always assumed to be correct * as set by the caller. This may be confusing... */ if (flags & TH_SYN) { /* * we have to rewrite the correct addresses! */ ip->ip_dst.s_addr = htonl(id->dst_ip); ip->ip_src.s_addr = htonl(id->src_ip); tcp->th_dport = htons(id->dst_port); tcp->th_sport = htons(id->src_port); } tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp->th_flags = TH_ACK; } /* * set ip_len to the payload size so we can compute * the tcp checksum on the pseudoheader * XXX check this, could save a couple of words ? */ ip->ip_len = htons(sizeof(struct tcphdr)); tcp->th_sum = in_cksum(m, m->m_pkthdr.len); /* * now fill fields left out earlier */ ip->ip_ttl = ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); } /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, u_short offset, int ip_len) { if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ if (args->eh != NULL) { struct ip *ip = mtod(args->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = send_pkt(&(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } m_freem(args->m); } else m_freem(args->m); args->m = NULL; } /** * * Given an ip_fw *, lookup_next_rule will return a pointer * to the next rule, which can be either the jump * target (for skipto instructions) or the next one in the list (in * all other cases including a missing jump target). * The result is also written in the "next_rule" field of the rule. * Backward jumps are not allowed, so start looking from the next * rule... * * This never returns NULL -- in case we do not have an exact match, * the next rule is returned. When the ruleset is changed, * pointers are flushed so we are always correct. */ static struct ip_fw * lookup_next_rule(struct ip_fw *me) { struct ip_fw *rule = NULL; ipfw_insn *cmd; /* look for action, in case it is a skipto */ cmd = ACTION_PTR(me); if (cmd->opcode == O_LOG) cmd += F_LEN(cmd); if (cmd->opcode == O_ALTQ) cmd += F_LEN(cmd); if ( cmd->opcode == O_SKIPTO ) for (rule = me->next; rule ; rule = rule->next) if (rule->rulenum >= cmd->arg1) break; if (rule == NULL) /* failure or not a skipto */ rule = me->next; me->next_rule = rule; return rule; } static int add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { struct radix_node_head *rnh; struct table_entry *ent; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); if (ent == NULL) return (ENOMEM); ent->value = value; ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; IPFW_WLOCK(&layer3_chain); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { IPFW_WUNLOCK(&layer3_chain); free(ent, M_IPFW_TBL); return (EEXIST); } IPFW_WUNLOCK(&layer3_chain); return (0); } static int del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa, mask; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; sa.sin_len = mask.sin_len = 8; mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; IPFW_WLOCK(ch); ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); if (ent == NULL) { IPFW_WUNLOCK(ch); return (ESRCH); } IPFW_WUNLOCK(ch); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static int flush_table(struct ip_fw_chain *ch, uint16_t tbl) { struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; KASSERT(rnh != NULL, ("NULL IPFW table")); rnh->rnh_walktree(rnh, flush_table_entry, rnh); return (0); } static void flush_tables(struct ip_fw_chain *ch) { uint16_t tbl; IPFW_WLOCK_ASSERT(ch); for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) flush_table(ch, tbl); } static int init_tables(struct ip_fw_chain *ch) { int i; uint16_t j; for (i = 0; i < IPFW_TABLES_MAX; i++) { if (!rn_inithead((void **)&ch->tables[i], 32)) { for (j = 0; j < i; j++) { (void) flush_table(ch, j); } return (ENOMEM); } } return (0); } static int lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa; if (tbl >= IPFW_TABLES_MAX) return (0); rnh = ch->tables[tbl]; sa.sin_len = 8; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); if (ent != NULL) { *val = ent->value; return (1); } return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } static int count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl]; *cnt = 0; rnh->rnh_walktree(rnh, count_table_entry, cnt); return (0); } static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } static int dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) { struct radix_node_head *rnh; IPFW_WLOCK_ASSERT(ch); if (tbl->tbl >= IPFW_TABLES_MAX) return (EINVAL); rnh = ch->tables[tbl->tbl]; tbl->cnt = 0; rnh->rnh_walktree(rnh, dump_table_entry, tbl); return (0); } static void fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp) { struct ucred *cr; if (inp->inp_socket != NULL) { cr = inp->inp_socket->so_cred; ugp->fw_prid = jailed(cr) ? cr->cr_prison->pr_id : -1; ugp->fw_uid = cr->cr_uid; ugp->fw_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, ugp->fw_groups, sizeof(ugp->fw_groups)); } } static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; int match; gid_t *gp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *lookup == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { fill_ugid_cache(inp, ugp); *lookup = 1; } } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*lookup == -1) return (0); if (proto == IPPROTO_TCP) { wildcard = 0; pi = &tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = 1; pi = &udbinfo; } else return 0; match = 0; if (*lookup == 0) { INP_INFO_RLOCK(pi); pcb = (oif) ? in_pcblookup_hash(pi, dst_ip, htons(dst_port), src_ip, htons(src_port), wildcard, oif) : in_pcblookup_hash(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), wildcard, NULL); if (pcb != NULL) { INP_LOCK(pcb); if (pcb->inp_socket != NULL) { fill_ugid_cache(pcb, ugp); *lookup = 1; } INP_UNLOCK(pcb); } INP_INFO_RUNLOCK(pi); if (*lookup == 0) { /* * If the lookup did not yield any results, there * is no sense in coming back and trying again. So * we can set lookup to -1 and ensure that we wont * bother the pcb system again. */ *lookup = -1; return (0); } } if (insn->o.opcode == O_UID) match = (ugp->fw_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) { for (gp = ugp->fw_groups; gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++) if (*gp == (gid_t)insn->d[0]) { match = 1; break; } } else if (insn->o.opcode == O_JAIL) match = (ugp->fw_prid == (int)insn->d[0]); return match; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, or NULL for layer3 packet. * args->oif Outgoing interface, or NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->cookie a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables hold state during the processing of a packet. * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is simply an alias of the value of m, and it is kept * in sync with it (the packet is supposed to start with * the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ struct ip_fw_ugid fw_ugid_cache; int ugid_lookup = 0; /* * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG * associated with a packet input on a divert socket. This * will allow to distinguish traffic and its direction when * it originates from a divert socket. */ u_int divinput_flags = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; struct ip_fw *f = NULL; /* matching rule */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset == 0 means there is no Fragment Header. * If offset != 0 for IPv6 always use correct mask to * get the correct offset because we add IP6F_MORE_FRAG * to be able to dectect the first fragment which would * otherwise have offset = 0. */ u_short offset = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ u_int8_t proto; u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ u_int16_t ip_len=0; int pktlen; /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &layer3_chain; struct m_tag *mtag; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; u_int16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; if (m->m_flags & M_SKIP_FIREWALL) return (IP_FW_PASS); /* accept */ pktlen = m->m_pkthdr.len; proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(len, p, T) \ do { \ int x = (len) + sizeof(T); \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (len)); \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || ntohs(args->eh->ether_type)==ETHERTYPE_IPV6) && mtod(m, struct ip *)->ip_v == 6) { is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = mtod(m, struct ip6_hdr *)->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); args->f_id.flags = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); if (((struct ip6_rthdr *)ulp)->ip6r_type != 0) { printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; /* Add IP6F_MORE_FRAG for offset of first * fragment to be != 0. */ offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.frag_id6 = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_ext); /* Packet ends here. if ip6e_len!=0 octets * must be ignored. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); if (fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } /*switch */ } args->f_id.src_ip6 = mtod(m,struct ip6_hdr *)->ip6_src; args->f_id.dst_ip6 = mtod(m,struct ip6_hdr *)->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(mtod(m, struct ip6_hdr *)->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || ntohs(args->eh->ether_type) == ETHERTYPE_IP) && mtod(m, struct ip *)->ip_v == 4) { is_ipv4 = 1; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; if (args->eh != NULL) { /* layer 2 packets are as on the wire */ offset = ntohs(ip->ip_off) & IP_OFFMASK; ip_len = ntohs(ip->ip_len); } else { offset = ip->ip_off & IP_OFFMASK; ip_len = ip->ip_len; } pktlen = ip_len < pktlen ? ip_len : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; args->f_id.flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); if (args->rule) { /* * Packet has already been tagged. Look for the next rule * to restart processing. * * If fw_one_pass != 0 then just accept it. * XXX should not happen here, but optimized out in * the caller. */ if (fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } f = args->rule->next_rule; if (f == NULL) f = lookup_next_rule(args->rule); } else { /* * Find the starting rule. It can be either the first * one, or the one after divert_rule if asked so. */ int skipto = mtag ? divert_cookie(mtag) : 0; f = chain->rules; if (args->eh == NULL && skipto != 0) { if (skipto >= IPFW_DEFAULT_RULE) { IPFW_RUNLOCK(chain); return (IP_FW_DENY); /* invalid */ } while (f && f->rulenum <= skipto) f = f->next; if (f == NULL) { /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } } } /* reset divert rule to avoid confusion later */ if (mtag) { divinput_flags = divert_info(mtag) & (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG); m_tag_delete(m, mtag); } /* * Now scan the rules, and parse microinstructions for each rule. */ for (; f; f = f->next) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ again: if (set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ check_body: cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset!=0) break; if (is_ipv6) /* XXX to be fixed later */ break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, src_ip, src_port, &fw_ugid_cache, &ugid_lookup, args->inp); break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t t = ntohs(args->eh->ether_type); u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (t>=p[0] && t<=p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: match = (cmd->arg1 & 1 && divinput_flags & IP_FW_DIVERT_LOOPBACK_FLAG) || (cmd->arg1 & 2 && divinput_flags & IP_FW_DIVERT_OUTPUT_FLAG); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v; match = lookup_table(chain, cmd->arg1, a, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); } break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); } break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(mtod(m, struct ip *), cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == mtod(m, struct ip *)->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = ip_len; else if (cmd->opcode == O_IPTTL) x = mtod(m, struct ip *)->ip_ttl; else /* must be IPID */ x = ntohs(mtod(m, struct ip *)->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (mtod(m, struct ip *)->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, mtod(m, struct ip *)->ip_tos)); break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = ip_len - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: match = (proto == IPPROTO_TCP && offset == 0 && cmd->arg1 == TCP(ulp)->th_win); break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct altq_tag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; mtag = m_tag_find(m, PACKET_TAG_PF_QID, NULL); if (mtag != NULL) break; mtag = m_tag_get(PACKET_TAG_PF_QID, sizeof(struct altq_tag), M_NOWAIT); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } at = (struct altq_tag *)(mtag+1); at->qid = altq->qid; if (is_ipv4) at->af = AF_INET; else at->af = AF_LINK; at->hdr = ip; m_tag_prepend(m, mtag); break; } case O_LOG: if (fw_verbose) ipfw_log(f, hlen, args, m, oif, offset); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL) : #endif verify_path(src_ip, NULL))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif) : #endif verify_path(src_ip, m->m_pkthdr.rcvif); else match = 1; break; case O_IPSEC: #ifdef FAST_IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif #ifdef IPSEC match = (ipsec_getnhist(m) != 0); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: if (is_ipv6) { ipfw_insn_ip6 *te = (ipfw_insn_ip6 *)cmd; struct in6_addr p = args->f_id.src_ip6; APPLY_MASK(&p, &te->mask6); match = IN6_ARE_ADDR_EQUAL(&te->addr6, &p); } break; case O_IP6_DST_MASK: if (is_ipv6) { ipfw_insn_ip6 *te = (ipfw_insn_ip6 *)cmd; struct in6_addr p = args->f_id.dst_ip6; APPLY_MASK(&p, &te->mask6); match = IN6_ARE_ADDR_EQUAL(&te->addr6, &p); } break; case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); break; case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to do a 'goto done'). * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * ('goto next_rule', equivalent to a 'break 2'), * or to the SKIPTO target ('goto again' after * having set f, cmd and l), respectively. * * O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet * must be dropped * ('goto done' after setting retval); * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * ('goto check_body') if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found ('goto next_rule'). * The result of the lookup is cached to make * further instances of these opcodes are * effectively NOPs. */ case O_LIMIT: case O_KEEP_STATE: if (install_state(f, (ipfw_insn_limit *)cmd, args)) { retval = IP_FW_DENY; goto done; /* error/limit violation */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule. */ q->pcnt++; q->bcnt += pktlen; f = q->rule; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; IPFW_DYN_UNLOCK(); goto check_body; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) goto next_rule; match = 1; break; case O_ACCEPT: retval = 0; /* accept */ goto done; case O_PIPE: case O_QUEUE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = IP_FW_DUMMYNET; goto done; case O_DIVERT: case O_TEE: { struct divert_tag *dt; if (args->eh) /* not on layer 2 */ break; mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), M_NOWAIT); if (mtag == NULL) { /* XXX statistic */ /* drop packet */ IPFW_RUNLOCK(chain); return (IP_FW_DENY); } dt = (struct divert_tag *)(mtag+1); dt->cookie = f->rulenum; if (cmd->arg1 == IP_FW_TABLEARG) dt->info = tablearg; else dt->info = cmd->arg1; m_tag_prepend(m, mtag); retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; goto done; } case O_COUNT: case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; if (cmd->opcode == O_COUNT) goto next_rule; /* handle skipto */ if (f->next_rule == NULL) lookup_next_rule(f); f = f->next_rule; goto again; case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, offset,ip_len); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(args->f_id.flags) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6(args, cmd->arg1, offset, hlen); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; goto done; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) args->next_hop = &((ipfw_insn_sa *)cmd)->sa; retval = IP_FW_PASS; goto done; case O_NETGRAPH: case O_NGTEE: args->rule = f; /* report matching rule */ if (cmd->arg1 == IP_FW_TABLEARG) args->cookie = tablearg; else args->cookie = cmd->arg1; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; goto done; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner for, scan opcodes */ next_rule:; /* try next rule */ } /* end of outer for, scan rules */ printf("ipfw: ouch!, skip past end of rules, denying packet\n"); IPFW_RUNLOCK(chain); return (IP_FW_DENY); done: /* Update statistics */ f->pcnt++; f->bcnt += pktlen; f->timestamp = time_uptime; IPFW_RUNLOCK(chain); return (retval); pullup_failed: if (fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * When a rule is added/deleted, clear the next_rule pointers in all rules. * These will be reconstructed on the fly as packets are matched. */ static void flush_rule_ptrs(struct ip_fw_chain *chain) { struct ip_fw *rule; IPFW_WLOCK_ASSERT(chain); for (rule = chain->rules; rule; rule = rule->next) rule->next_rule = NULL; } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. */ static int add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { struct ip_fw *rule, *f, *prev; int l = RULESIZE(input_rule); if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) return (EINVAL); rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO); if (rule == NULL) return (ENOSPC); bcopy(input_rule, rule, l); rule->next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; IPFW_WLOCK(chain); if (chain->rules == NULL) { /* default rule */ chain->rules = rule; goto done; } /* * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ if (autoinc_step < 1) autoinc_step = 1; else if (autoinc_step > 1000) autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default */ for (f = chain->rules; f; f = f->next) { if (f->rulenum == IPFW_DEFAULT_RULE) break; rule->rulenum = f->rulenum; } if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) rule->rulenum += autoinc_step; input_rule->rulenum = rule->rulenum; } /* * Now insert the new rule in the right place in the sorted list. */ for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) { if (f->rulenum > rule->rulenum) { /* found the location */ if (prev) { rule->next = f; prev->next = rule; } else { /* head insert */ rule->next = chain->rules; chain->rules = rule; } break; } } flush_rule_ptrs(chain); done: static_count++; static_len += l; IPFW_WUNLOCK(chain); DEB(printf("ipfw: installed rule %d, static count now %d\n", rule->rulenum, static_count);) return (0); } /** * Remove a static rule (including derived * dynamic rules) * and place it on the ``reap list'' for later reclamation. * The caller is in charge of clearing rule pointers to avoid * dangling pointers. * @return a pointer to the next entry. * Arguments are not checked, so they better be correct. */ static struct ip_fw * remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { struct ip_fw *n; int l = RULESIZE(rule); IPFW_WLOCK_ASSERT(chain); n = rule->next; IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); if (prev == NULL) chain->rules = n; else prev->next = n; static_count--; static_len -= l; rule->next = chain->reap; chain->reap = rule; return n; } /** * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. */ static void reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; if (DUMMYNET_LOADED) ip_dn_ruledel_ptr(rule); free(rule, M_IPFW); } } /* * Remove all rules from a chain (except rules in set RESVD_SET * unless kill_default = 1). The caller is responsible for * reclaiming storage for the rules left in chain->reap. */ static void free_chain(struct ip_fw_chain *chain, int kill_default) { struct ip_fw *prev, *rule; IPFW_WLOCK_ASSERT(chain); flush_rule_ptrs(chain); /* more efficient to do outside the loop */ for (prev = NULL, rule = chain->rules; rule ; ) if (kill_default || rule->set != RESVD_SET) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } } /** * Remove all rules with given number, and also do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an u_int32_t. The low 16 bit are the rule or set number, * the next 8 bits are the new set, the top 8 bits are the command: * * 0 delete rules with given number * 1 delete rules with given set number * 2 move rules with given number to new set * 3 move rules with given set number to new set * 4 swap sets with given numbers */ static int del_entry(struct ip_fw_chain *chain, u_int32_t arg) { struct ip_fw *prev = NULL, *rule; u_int16_t rulenum; /* rule or old_set */ u_int8_t cmd, new_set; rulenum = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 4) return EINVAL; if (new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2) { if (rulenum >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (rulenum > RESVD_SET) /* old_set */ return EINVAL; } IPFW_WLOCK(chain); rule = chain->rules; chain->reap = NULL; switch (cmd) { case 0: /* delete rules with given number */ /* * locate first rule to delete */ for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) ; if (rule->rulenum != rulenum) { IPFW_WUNLOCK(chain); return EINVAL; } /* * flush pointers outside the loop, then delete all matching * rules. prev remains the same throughout the cycle. */ flush_rule_ptrs(chain); while (rule->rulenum == rulenum) rule = remove_rule(chain, rule, prev); break; case 1: /* delete all rules with given set number */ flush_rule_ptrs(chain); rule = chain->rules; while (rule->rulenum < IPFW_DEFAULT_RULE) if (rule->set == rulenum) rule = remove_rule(chain, rule, prev); else { prev = rule; rule = rule->next; } break; case 2: /* move rules with given number to new set */ rule = chain->rules; for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->rulenum == rulenum) rule->set = new_set; break; case 3: /* move rules with given set number to new set */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; break; case 4: /* swap two sets */ for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) if (rule->set == rulenum) rule->set = new_set; else if (rule->set == new_set) rule->set = rulenum; break; } /* * Look for rules to reclaim. We grab the list before * releasing the lock then reclaim them w/o the lock to * avoid a LOR with dummynet. */ rule = chain->reap; chain->reap = NULL; IPFW_WUNLOCK(chain); if (rule) reap_rules(rule); return 0; } /* * Clear counters for a specific rule. * The enclosing "table" is assumed locked. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * @arg frwl is null to clear all entries, or contains a specific * rule number. * @arg log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, int rulenum, int log_only) { struct ip_fw *rule; char *msg; IPFW_WLOCK(chain); if (rulenum == 0) { norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) clear_counters(rule, log_only); msg = log_only ? "ipfw: All logging counts reset.\n" : "ipfw: Accounting cleared.\n"; } else { int cleared = 0; /* * We can have multiple rules with the same number, so we * need to clear them all. */ for (rule = chain->rules; rule; rule = rule->next) if (rule->rulenum == rulenum) { while (rule && rule->rulenum == rulenum) { clear_counters(rule, log_only); rule = rule->next; } cleared = 1; break; } if (!cleared) { /* we did not find any matching rules */ IPFW_WUNLOCK(chain); return (EINVAL); } msg = log_only ? "ipfw: Entry %d logging count reset.\n" : "ipfw: Entry %d cleared.\n"; } IPFW_WUNLOCK(chain); if (fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } /* * Check validity of the structure before insert. * Fortunately rules are simple, so this mostly need to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } DEB(printf("ipfw: opcode %d\n", cmd->opcode);) switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_TCPWIN: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (!NG_IPFW_LOADED) return EINVAL; else goto check_size; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; int i; /* XXX this can take a long time and locking will block packet flow */ IPFW_RLOCK(chain); for (rule = chain->rules; rule ; rule = rule->next) { /* * Verify the entry fits in the buffer in case the * rules changed between calculating buffer space and * now. This would be better done using a generation * number but should suffice for now. */ i = RULESIZE(rule); if (bp + i <= ep) { bcopy(rule, bp, i); bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), sizeof(set_disable)); bp += i; } } IPFW_RUNLOCK(chain); if (ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets; i++) for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; bcopy(p, dst, sizeof *p); bcopy(&(p->rule->rulenum), &(dst->rule), sizeof(p->rule->rulenum)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ bcopy(&dst, &dst->next, sizeof(dst)); last = dst; dst->expire = TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime ; bp += sizeof(ipfw_dyn_rule); } } IPFW_DYN_UNLOCK(); if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); } return (bp - (char *)buf); } /** * {set|get}sockopt parser. */ static int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error, rule_num; size_t size; struct ip_fw *buf, *rule; u_int32_t rulenum[2]; error = suser(sopt->sopt_td); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } error = 0; switch (sopt->sopt_name) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ size = static_len; /* size of static rules */ if (ipfw_dyn_v) /* add size of dyn.rules */ size += (dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know * how much room is needed, do not bother filling up the * buffer, just jump to the sooptcopyout. */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, ipfw_getrules(&layer3_chain, buf, size)); free(buf, M_TEMP); break; case IP_FW_FLUSH: /* * Normally we cannot release the lock on each iteration. * We could do it here only because we start from the head all * the times so there is no risk of missing some entries. * On the other hand, the risk is that we end up with * a very inconsistent ruleset, so better keep the lock * around the whole cycle. * * XXX this code can be improved by resetting the head of * the list to point to the default rule, and then freeing * the old list without the need for a lock. */ IPFW_WLOCK(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 0 /* keep default rule */); rule = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (layer3_chain.reap != NULL) reap_rules(rule); break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw) ); if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { error = add_rule(&layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ error = del_entry(&layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ set_disable = (set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_val != 0) { error = sooptcopyin(sopt, &rule_num, sizeof(int), sizeof(int)); if (error) break; } error = zero_entry(&layer3_chain, rule_num, sopt->sopt_name == IP_FW_RESETLOG); break; case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = add_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = del_table_entry(&layer3_chain, ent.tbl, ent.addr, ent.masklen); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; IPFW_WLOCK(&layer3_chain); error = flush_table(&layer3_chain, tbl); IPFW_WUNLOCK(&layer3_chain); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; IPFW_RLOCK(&layer3_chain); if ((error = count_table(&layer3_chain, tbl, &cnt))) break; IPFW_RUNLOCK(&layer3_chain); error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); if (tbl == NULL) { error = ENOMEM; break; } error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); IPFW_WLOCK(&layer3_chain); error = dump_table(&layer3_chain, tbl); if (error) { IPFW_WUNLOCK(&layer3_chain); free(tbl, M_TEMP); break; } IPFW_WUNLOCK(&layer3_chain); error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } /** * dummynet needs a reference to the default rule, because rules can be * deleted while packets hold a reference to them. When this happens, * dummynet changes the reference to the default rule (it could well be a * NULL pointer, but this way we do not need to check for the special * case, plus here he have info on the default behaviour). */ struct ip_fw *ip_fw_default_rule; /* * This procedure is only used to handle keepalives. It is invoked * every dyn_keepalive_period */ static void ipfw_tick(void * __unused unused) { struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) goto done; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. */ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); for (i = 0 ; i < curr_dyn_buckets ; i++) { for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; if (TIME_LEQ( time_uptime+dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) continue; /* too late, rule expired */ *mtailp = send_pkt(&(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; *mtailp = send_pkt(&(q->id), q->ack_fwd - 1, q->ack_rev, 0); if (*mtailp != NULL) mtailp = &(*mtailp)->m_nextpkt; } } IPFW_DYN_UNLOCK(); for (m = mnext = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; ip_output(m, NULL, NULL, 0, NULL, NULL); } done: callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); } int ipfw_init(void) { struct ip_fw default_rule; int error; #ifdef INET6 /* Setup IPv6 fw sysctl tree. */ sysctl_ctx_init(&ip6_fw_sysctl_ctx); ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, &fw_deny_unknown_exthdrs, 0, "Deny packets with unknown IPv6 Extension Headers"); #endif layer3_chain.rules = NULL; - layer3_chain.want_write = 0; - layer3_chain.busy_count = 0; - cv_init(&layer3_chain.cv, "Condition variable for IPFW rw locks"); IPFW_LOCK_INIT(&layer3_chain); ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule zone", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); IPFW_DYN_LOCK_INIT(); callout_init(&ipfw_timeout, NET_CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); default_rule.act_ofs = 0; default_rule.rulenum = IPFW_DEFAULT_RULE; default_rule.cmd_len = 1; default_rule.set = RESVD_SET; default_rule.cmd[0].len = 1; default_rule.cmd[0].opcode = #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT 1 ? O_ACCEPT : #endif O_DENY; error = add_rule(&layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_default_rule = layer3_chain.rules; printf("ipfw2 (+ipv6) initialized, divert %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny"); #ifdef IPFIREWALL_VERBOSE fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif if (fw_verbose == 0) printf("disabled\n"); else if (verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", verbose_limit); error = init_tables(&layer3_chain); if (error) { IPFW_DYN_LOCK_DESTROY(); IPFW_LOCK_DESTROY(&layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); return (0); } void ipfw_destroy(void) { struct ip_fw *reap; ip_fw_chk_ptr = NULL; ip_fw_ctl_ptr = NULL; callout_drain(&ipfw_timeout); IPFW_WLOCK(&layer3_chain); flush_tables(&layer3_chain); layer3_chain.reap = NULL; free_chain(&layer3_chain, 1 /* kill default rule */); reap = layer3_chain.reap, layer3_chain.reap = NULL; IPFW_WUNLOCK(&layer3_chain); if (reap != NULL) reap_rules(reap); IPFW_DYN_LOCK_DESTROY(); uma_zdestroy(ipfw_dyn_rule_zone); IPFW_LOCK_DESTROY(&layer3_chain); #ifdef INET6 /* Free IPv6 fw sysctl tree. */ sysctl_ctx_free(&ip6_fw_sysctl_ctx); #endif printf("IP firewall unloaded\n"); } diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 6f1130ca1114..2250116c6edd 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1,1579 +1,1579 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_carp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_CARP #include #endif #if defined(IPSEC) || defined(FAST_IPSEC) #include #endif /* IPSEC */ #include /* XXX: Temporary until ipfw_ether and ipfw_bridge are converted. */ #include #include int rsvp_on = 0; int ipforwarding = 0; SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, &ipforwarding, 0, "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_keepfaith = 0; SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int ip_sendsourcequench = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, &ip_sendsourcequench, 0, "Enable the transmission of source quench packets"); int ip_do_randomid = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, &ip_do_randomid, 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static int ip_checkinterface = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); struct pfil_head inet_pfil_hook; /* Packet filter hooks */ static struct ifqueue ipintrq; static int ipqmaxlen = IFQ_MAXLEN; extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* * IP datagram reassembly. */ #define IPREASS_NHASH_LOG2 6 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) #define IPREASS_HASH(x,y) \ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) static uma_zone_t ipq_zone; static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static struct mtx ipqlock; #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) #define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) #define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) static void maxnipq_update(void); static int maxnipq; /* Administrative limit on # reass queues. */ static int nipq = 0; /* Total # of reass queues */ SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, &nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); static int maxfragsperpacket; SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, &maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); struct callout ipport_tick_callout; #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH int ipstealth = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, &ipstealth, 0, ""); #endif /* * ipfw_ether and ipfw_bridge hooks. * XXX: Temporary until those are converted to pfil_hooks as well. */ ip_fw_chk_t *ip_fw_chk_ptr = NULL; ip_dn_io_t *ip_dn_io_ptr = NULL; int fw_enable = 1; int fw_one_pass = 1; static void ip_freef(struct ipqhead *, struct ipq *); /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init() { register struct protosw *pr; register int i; TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } /* Initialize packet filter hooks. */ inet_pfil_hook.ph_type = PFIL_TYPE_AF; inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); /* Initialize IP reassembly queue. */ IPQ_LOCK_INIT(); for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&ipq[i]); maxnipq = nmbclusters / 32; maxfragsperpacket = 16; ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxnipq_update(); /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); ipport_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); /* Initialize various other remaining things. */ ip_id = time_second & 0xffff; ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, NETISR_MPSAFE); } void ip_fini(xtp) void *xtp; { callout_stop(&ipport_tick_callout); } /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; int checkif, hlen = 0; u_short sum; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall or NAT changed destination to local. * We expect ip_len and ip_off to be in host byte order. */ m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; goto ours; } ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { ipstat.ips_badvers++; goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); } /* 127/8 must not appear on wire - RFC1122 */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { ipstat.ips_badsum++; goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif /* * Convert fields to host representation. */ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip->ip_len; m->m_pkthdr.len = ip->ip_len; } else m_adj(m, ip->ip_len - m->m_pkthdr.len); } #if defined(IPSEC) || defined(FAST_IPSEC) /* * Bypass packet filtering for packets from a tunnel (gif). */ if (ip_ipsec_filtergif(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ - if (inet_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); #ifdef IPFIREWALL_FORWARD if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } #ifndef IPFIREWALL_FORWARD_EXTENDED dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); #else if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { /* * Directly ship on the packet. This allows to forward packets * that were destined for us to some other directly connected * host. */ ip_forward(m, dchg); return; } #endif /* IPFIREWALL_FORWARD_EXTENDED */ #endif /* IPFIREWALL_FORWARD */ passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = ip_checkinterface && (ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && #ifdef DEV_CARP !m->m_pkthdr.rcvif->if_carp && #endif (dchg == 0); /* * Check for exact addresses in the hash bucket. */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) goto ours; } /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (m->m_pkthdr.rcvif != NULL && m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) goto ours; if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) goto ours; #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) goto ours; #endif } } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; if (ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { ipstat.ips_cantforward++; m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the * arrival interface. */ IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); IN_MULTI_UNLOCK(); if (inm == NULL) { ipstat.ips_notmember++; m_freem(m); return; } goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { if (ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (ipforwarding == 0) { ipstat.ips_cantforward++; m_freem(m); } else { #if defined(IPSEC) || defined(FAST_IPSEC) if (ip_ipsec_fwd(m)) goto bad; #endif /* IPSEC */ ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* Count the packet in the ip address stats */ if (ia != NULL) { ia->ia_ifa.if_ipackets++; ia->ia_ifa.if_ibytes += m->m_pkthdr.len; } /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & (IP_MF | IP_OFFMASK)) { m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } /* * Further protocols expect the packet length to be w/o the * IP header. */ ip->ip_len -= hlen; #if defined(IPSEC) || defined(FAST_IPSEC) /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m)) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ ipstat.ips_delivered++; (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; bad: m_freem(m); } /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void maxnipq_update(void) { /* * -1 for unlimited allocation. */ if (maxnipq < 0) uma_zone_set_max(ipq_zone, 0); /* * Positive number for specific bound. */ if (maxnipq > 0) uma_zone_set_max(ipq_zone, maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ if (maxnipq == 0) uma_zone_set_max(ipq_zone, 1); } static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { int error, i; i = maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); /* * XXXRW: Might be a good idea to sanity check the argument and place * an extreme upper bound. */ if (i < -1) return (EINVAL); maxnipq = i; maxnipq_update(); return (0); } SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, sysctl_maxnipq, "I", "Maximum number of IPv4 fragment reassembly queue entries"); /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ struct mbuf * ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (maxnipq == 0 || maxfragsperpacket == 0) { ipstat.ips_fragments++; ipstat.ips_fragdropped++; m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); head = &ipq[hash]; IPQ_LOCK(); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_fragment_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = NULL; /* * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ if ((nipq > maxnipq) && (maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); if (r) { ipstat.ips_fragtimeout += r->ipq_nfrags; ip_freef(&ipq[i], r); break; } } } else { ipstat.ips_fragtimeout += q->ipq_nfrags; ip_freef(head, q); } } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len -= hlen; if (ip->ip_off & IP_MF) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { ipstat.ips_toosmall++; /* XXX */ goto dropfrag; } m->m_flags |= M_FRAG; } else m->m_flags &= ~M_FRAG; ip->ip_off <<= 3; /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ ipstat.ips_fragments++; m->m_pkthdr.header = ip; /* Previous ip_reass() started here. */ /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { fp = uma_zalloc(ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_init_ipq(fp, M_NOWAIT) != 0) { uma_zfree(ipq_zone, fp); goto dropfrag; } mac_create_ipq(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; goto done; } else { fp->ipq_nfrags++; #ifdef MAC mac_update_ipq(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (GETIP(q)->ip_off > ip->ip_off) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the the preceding * segment, then it's checksum is invalidated. */ if (p) { i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; if (i > 0) { if (i >= ip->ip_len) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off += i; ip->ip_len -= i; } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; q = nq) { i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; if (i < GETIP(q)->ip_len) { GETIP(q)->ip_len -= i; GETIP(q)->ip_off += i; m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; ipstat.ips_fragdropped++; fp->ipq_nfrags--; m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) * */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) { if (fp->ipq_nfrags > maxfragsperpacket) { ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; } next += GETIP(q)->ip_len; } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) { if (fp->ipq_nfrags > maxfragsperpacket) { ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { ipstat.ips_toolong++; ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } #ifdef MAC mac_create_datagram_from_ipq(fp, m); mac_destroy_ipq(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = (ip->ip_hl << 2) + next; ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); nipq--; uma_zfree(ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); ipstat.ips_reassembled++; IPQ_UNLOCK(); return (m); dropfrag: ipstat.ips_fragdropped++; if (fp != NULL) fp->ipq_nfrags--; m_freem(m); done: IPQ_UNLOCK(); return (NULL); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(fhp, fp) struct ipqhead *fhp; struct ipq *fp; { register struct mbuf *q; IPQ_LOCK_ASSERT(); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); uma_zfree(ipq_zone, fp); nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo() { register struct ipq *fp; int i; IPQ_LOCK(); for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { ipstat.ips_fragtimeout += fpp->ipq_nfrags; ip_freef(&ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ if (maxnipq >= 0 && nipq > maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { while (nipq > maxnipq && !TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped += TAILQ_FIRST(&ipq[i])->ipq_nfrags; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } } IPQ_UNLOCK(); } /* * Drain off all datagram fragments. */ void ip_drain() { int i; IPQ_LOCK(); for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&ipq[i])) { ipstat.ips_fragdropped += TAILQ_FIRST(&ipq[i])->ipq_nfrags; ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); } } IPQ_UNLOCK(); in_rtqdrain(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(u_char ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto == 0) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } else return (EINVAL); } } return (EPROTONOSUPPORT); } int ipproto_unregister(u_char ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto == 0) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } /* * Given address of next destination (final or next hop), * return internet address info of interface to be used to get there. */ struct in_ifaddr * ip_rtaddr(dst) struct in_addr dst; { struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ifa; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; rtalloc_ign(&sro, RTF_CLONING); if (sro.ro_rt == NULL) return (NULL); ifa = ifatoia(sro.ro_rt->rt_ifa); RTFREE(sro.ro_rt); return (ifa); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia = NULL; struct mbuf *mcopy; struct in_addr dest; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { ipstat.ips_cantforward++; m_freem(m); return; } #ifdef IPSTEALTH if (!ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif if (!srcrt && (ia = ip_rtaddr(ip->ip_dst)) == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return; } /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ MGETHDR(mcopy, M_DONTWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct route ro; struct rtentry *rt; bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; rtalloc_ign(&ro, RTF_CLONING); rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } if (rt) RTFREE(rt); } error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); if (error) ipstat.ips_cantforward++; else { ipstat.ips_forward++; if (type) ipstat.ips_redirectsent++; else { if (mcopy) m_freem(mcopy); return; } } if (mcopy == NULL) return; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #if defined(IPSEC) || defined(FAST_IPSEC) mtu = ip_ipsec_mtu(m); #endif /* IPSEC */ /* * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu == 0) { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ip->ip_len, 0); } ipstat.ips_cantfrag++; break; case ENOBUFS: /* * A router should not generate ICMP_SOURCEQUENCH as * required in RFC1812 Requirements for IP Version 4 Routers. * Source quench could be a big problem under DoS attacks, * or if the underlying interface is rate-limited. * Those who need source quench packets may re-enable them * via the net.inet.ip.sendsourcequench sysctl. */ if (ip_sendsourcequench == 0) { m_freem(mcopy); return; } else { type = ICMP_SOURCEQUENCH; code = 0; } break; case EACCES: /* ipfw denied packet */ m_freem(mcopy); return; } icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(inp, mp, ip, m) register struct inpcb *inp; register struct mbuf **mp; register struct ip *ip; register struct mbuf *m; { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t) opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) && ( ifp->if_index && (ifp->if_index <= if_index))) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if ((sdp->sdl_family != AF_LINK) || (sdp->sdl_len > sizeof(sdlbuf))) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } /* * XXX these routines are called from the upper part of the kernel. * They need to be locked when we remove Giant. * * They could also be moved to ip_mroute.c, since all the RSVP * handling is done there already. */ static int ip_rsvp_on; struct socket *ip_rsvpd; int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (ip_rsvpd != NULL) return EADDRINUSE; ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!ip_rsvp_on) { ip_rsvp_on = 1; rsvp_on++; } return 0; } int ip_rsvp_done(void) { ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (ip_rsvp_on) { ip_rsvp_on = 0; rsvp_on--; } return 0; } void rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ { if (rsvp_input_p) { /* call the real one if loaded */ rsvp_input_p(m, off); return; } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!rsvp_on) { m_freem(m); return; } if (ip_rsvpd != NULL) { rip_input(m, off); return; } /* Drop the packet */ m_freem(m); } diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index a3fa63bee6fe..915512e56669 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,1609 +1,1609 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(IPSEC) || defined(FAST_IPSEC) #include #ifdef IPSEC #include #endif #ifdef FAST_IPSEC #include #endif #endif /*IPSEC*/ #include static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); static int ip_getmoptions(struct inpcb *, struct sockopt *); static int ip_setmoptions(struct inpcb *, struct sockopt *); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) INP_LOCK_ASSERT(inp); if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, * short circuit routing lookup. */ if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) rtalloc_ign(ro, 0); if (ro->ro_rt == NULL) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } #ifndef notdef /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } #endif /* notdef */ /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > ifp->if_mtu) { error = EMSGSIZE; goto bad; } if (flags & IP_SENDONES) ip->ip_dst.s_addr = INADDR_BROADCAST; m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(FAST_IPSEC) switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ - if (inet_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { #ifndef IPFIREWALL_FORWARD_EXTENDED if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) { #endif dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; #ifndef IPFIREWALL_FORWARD_EXTENDED } else { m_tag_delete(m, fwd_tag); /* Continue. */ } #endif } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && ((ip->ip_off & IP_DF) == 0))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } if (ip->ip_off & IP_DF) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if (ro != NULL && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_create_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the appropriate next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); INP_LOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_UNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_LOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_UNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; } break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_LOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_UNLOCK(inp); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { caddr_t req; size_t len = 0; int priv; struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; priv = (sopt->sopt_td != NULL && suser(sopt->sopt_td) != 0) ? 0 : 1; req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; error = ipsec4_set_policy(inp, optname, req, len, priv); m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * XXX * The whole multicast option thing needs to be re-thought. * Several of these options are equally applicable to non-multicast * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. */ static struct ifnet * ip_multicast_if(a, ifindexp) struct in_addr *a; int *ifindexp; { int ifindex; struct ifnet *ifp; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; if (ifindex < 0 || if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); if (ifindexp) *ifindexp = ifindex; } else { INADDR_TO_IFP(*a, ifp); } return ifp; } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. */ static struct ip_moptions * ip_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; INP_LOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_UNLOCK(inp); imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; INP_LOCK(inp); if (inp->inp_moptions != NULL) { free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Set the IP multicast options in response to user setsockopt(). */ static int ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) { int error = 0; int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp; struct ip_moptions *imo; struct route ro; struct sockaddr_in *dst; int ifindex; int s; switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ case IP_MULTICAST_VIF: if (legal_vif_num == 0) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) break; if (!legal_vif_num(i) && (i != -1)) { error = EINVAL; break; } imo = ip_findmoptions(inp); imo->imo_multicast_vif = i; INP_UNLOCK(inp); break; case IP_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ imo = ip_findmoptions(inp); if (addr.s_addr == INADDR_ANY) { imo->imo_multicast_ifp = NULL; INP_UNLOCK(inp); break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { INP_UNLOCK(inp); splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_UNLOCK(inp); splx(s); break; case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char ttl; error = sooptcopyin(sopt, &ttl, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } else { u_int ttl; error = sooptcopyin(sopt, &ttl, sizeof ttl, sizeof ttl); if (error) break; if (ttl > 255) error = EINVAL; else { imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } } break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char loop; error = sooptcopyin(sopt, &loop, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } else { u_int loop; error = sooptcopyin(sopt, &loop, sizeof loop, sizeof loop); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } break; case IP_ADD_MEMBERSHIP: /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ if (mreq.imr_interface.s_addr == INADDR_ANY) { bzero((caddr_t)&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); break; } ifp = ro.ro_rt->rt_ifp; RTFREE(ro.ro_rt); } else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); } /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; splx(s); break; } /* * See if the membership already exists or if all the * membership slots are full. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i < imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRINUSE; splx(s); break; } if (i == IP_MAX_MEMBERSHIPS) { INP_UNLOCK(inp); error = ETOOMANYREFS; splx(s); break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { INP_UNLOCK(inp); error = ENOBUFS; splx(s); break; } ++imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; case IP_DROP_MEMBERSHIP: /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.imr_interface.s_addr == INADDR_ANY) ifp = NULL; else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; splx(s); break; } } /* * Find the membership in the membership array. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i == imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRNOTAVAIL; splx(s); break; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ static int ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; struct in_addr addr; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_LOCK(inp); imo = inp->inp_moptions; error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_UNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; } INP_UNLOCK(inp); error = sooptcopyout(sopt, &addr, sizeof addr); break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; default: INP_UNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Discard the IP multicast options. */ void ip_freemoptions(imo) register struct ip_moptions *imo; { register int i; if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) in_delmulti(imo->imo_membership[i]); free(imo, M_IPMOPTS); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(ifp, m, dst, hlen) struct ifnet *ifp; register struct mbuf *m; register struct sockaddr_in *dst; int hlen; { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. For the moment, we'll err on the side * of safety by using if_simloop(). */ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else if_simloop(ifp, copym, dst->sin_family, 0); #endif } } diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c index 5041fe87ff49..3b542cf6845c 100644 --- a/sys/netinet6/ip6_forward.c +++ b/sys/netinet6/ip6_forward.c @@ -1,675 +1,675 @@ /* $FreeBSD$ */ /* $KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /* IPSEC */ #ifdef FAST_IPSEC #include #include #include #define IPSEC #endif /* FAST_IPSEC */ #include #include #include struct route_in6 ip6_forward_rt; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * */ void ip6_forward(m, srcrt) struct mbuf *m; int srcrt; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst = NULL; struct rtentry *rt = NULL; int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *origifp; /* maybe unnecessary */ u_int32_t inzone, outzone; struct in6_addr src_in6, dst_in6; #ifdef IPSEC struct secpolicy *sp = NULL; int ipsecrt = 0; #endif #ifdef IPSEC /* * Check AH/ESP integrity. */ /* * Don't increment ip6s_cantforward because this is the check * before forwarding packet actually. */ if (ipsec6_in_reject(m, NULL)) { #if !defined(FAST_IPSEC) ipsec6stat.in_polvio++; #endif m_freem(m); return; } #endif /* IPSEC */ /* * Do not forward packets to multicast destination (should be handled * by ip6_mforward(). * Do not forward packets with unspecified source. It was discussed * in July 2000, on the ipngwg mailing list. */ if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 || IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { ip6stat.ip6s_cantforward++; /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ if (ip6_log_time + ip6_log_interval < time_second) { ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } m_freem(m); return; } #ifdef IPSTEALTH if (!ip6stealth) { #endif if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; #ifdef IPSTEALTH } #endif /* * Save at most ICMPV6_PLD_MAXLEN (= the min IPv6 MTU - * size of IPv6 + ICMPv6 headers) bytes of the packet in case * we need to generate an ICMP6 message to the src. * Thanks to M_EXT, in most cases copy will not occur. * * It is important to save it before IPsec processing as IPsec * processing may modify the mbuf. */ mcopy = m_copy(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN)); #ifdef IPSEC /* get a security policy for this packet */ sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error); if (sp == NULL) { ipsec6stat.out_inval++; ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsec6stat.out_polvio++; ip6stat.ip6s_cantforward++; key_freesp(sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ key_freesp(sp); goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* XXX should be panic ? */ printf("ip6_forward: No IPsec request specified.\n"); ip6stat.ip6s_cantforward++; key_freesp(sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } /* do IPsec */ break; case IPSEC_POLICY_ENTRUST: default: /* should be panic ?? */ printf("ip6_forward: Invalid policy found. %d\n", sp->policy); key_freesp(sp); goto skip_ipsec; } { struct ipsecrequest *isr = NULL; struct ipsec_output_state state; /* * when the kernel forwards a packet, it is not proper to apply * IPsec transport mode to the packet is not proper. this check * avoid from this. * at present, if there is even a transport mode SA request in the * security policy, the kernel does not apply IPsec to the packet. * this check is not enough because the following case is valid. * ipsec esp/tunnel/xxx-xxx/require esp/transport//require; */ for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_ANY) goto doipsectunnel; if (isr->saidx.mode == IPSEC_MODE_TUNNEL) goto doipsectunnel; } /* * if there's no need for tunnel mode IPsec, skip. */ if (!isr) goto skip_ipsec; doipsectunnel: /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ bzero(&state, sizeof(state)); state.m = m; state.ro = NULL; /* update at ipsec6_output_tunnel() */ state.dst = NULL; /* update at ipsec6_output_tunnel() */ error = ipsec6_output_tunnel(&state, sp, 0); m = state.m; key_freesp(sp); if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* FALLTHROUGH */ case ENOENT: /* don't show these error codes to the user */ break; } ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } if (ip6 != mtod(m, struct ip6_hdr *)) { /* * now tunnel mode headers are added. we are originating * packet instead of forwarding the packet. */ ip6_output(m, NULL, NULL, IPV6_FORWARDING/*XXX*/, NULL, NULL, NULL); goto freecopy; } /* adjust pointer */ dst = (struct sockaddr_in6 *)state.dst; rt = state.ro ? state.ro->ro_rt : NULL; if (dst != NULL && rt != NULL) ipsecrt = 1; } skip_ipsec: #endif /* IPSEC */ #ifdef IPSEC if (ipsecrt) goto skip_routing; #endif dst = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; if (!srcrt) { /* ip6_forward_rt.ro_dst.sin6_addr is equal to ip6->ip6_dst */ if (ip6_forward_rt.ro_rt == 0 || (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) == 0) { if (ip6_forward_rt.ro_rt) { RTFREE(ip6_forward_rt.ro_rt); ip6_forward_rt.ro_rt = 0; } /* this probably fails but give it a try again */ rtalloc((struct route *)&ip6_forward_rt); } if (ip6_forward_rt.ro_rt == 0) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); } m_freem(m); return; } } else if ((rt = ip6_forward_rt.ro_rt) == 0 || !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &dst->sin6_addr)) { if (ip6_forward_rt.ro_rt) { RTFREE(ip6_forward_rt.ro_rt); ip6_forward_rt.ro_rt = 0; } bzero(dst, sizeof(*dst)); dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; rtalloc((struct route *)&ip6_forward_rt); if (ip6_forward_rt.ro_rt == 0) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); } m_freem(m); return; } } rt = ip6_forward_rt.ro_rt; #ifdef IPSEC skip_routing:; #endif /* * Source scope check: if a packet can't be delivered to its * destination for the reason that the destination is beyond the scope * of the source address, discard the packet and return an icmp6 * destination unreachable error with Code 2 (beyond scope of source * address). We use a local copy of ip6_src, since in6_setscope() * will possibly modify its first argument. * [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ src_in6 = ip6->ip6_src; if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { /* XXX: this should not happen */ ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); return; } if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); return; } if (inzone != outzone #ifdef IPSEC && !ipsecrt #endif ) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); if (ip6_log_time + ip6_log_interval < time_second) { ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); m_freem(m); return; } /* * Destination scope check: if a packet is going to break the scope * zone of packet's destination address, discard it. This case should * usually be prevented by appropriately-configured routing table, but * we need an explicit check because we may mistakenly forward the * packet to a different zone by (e.g.) a default route. */ dst_in6 = ip6->ip6_dst; if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || inzone != outzone) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; m_freem(m); return; } if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { u_long mtu; #ifdef IPSEC struct secpolicy *sp; int ipsecerror; size_t ipsechdrsiz; #endif mtu = IN6_LINKMTU(rt->rt_ifp); #ifdef IPSEC /* * When we do IPsec tunnel ingress, we need to play * with the link value (decrement IPsec header size * from mtu value). The code is much simpler than v4 * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". */ sp = ipsec6_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp) { ipsechdrsiz = ipsec6_hdrsiz(mcopy, IPSEC_DIR_OUTBOUND, NULL); if (ipsechdrsiz < mtu) mtu -= ipsechdrsiz; } /* * if mtu becomes less than minimum MTU, * tell minimum MTU (and I'll need to fragment it). */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU; #endif icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); } m_freem(m); return; } if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)rt->rt_gateway; /* * If we are to forward the packet using the same interface * as one we got the packet from, perhaps we should send a redirect * to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a route * modified by a redirect. */ if (rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && #ifdef IPSEC !ipsecrt && #endif (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* * If the incoming interface is equal to the outgoing * one, and the link attached to the interface is * point-to-point, then it will be highly probable * that a routing loop occurs. Thus, we immediately * drop the packet and send an ICMPv6 error message. * * type/code is based on suggestion by Rich Draves. * not sure if it is the best pick. */ icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); m_freem(m); return; } type = ND_REDIRECT; } /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; /* If ipfw says divert, we have to just drop packet */ if ((*ip6_fw_chk_ptr)(&ip6, rt->rt_ifp, &port, &m)) { m_freem(m); goto freecopy; } if (!m) goto freecopy; } /* * Fake scoped addresses. Note that even link-local source or * destinaion can appear, if the originating node just sends the * packet to us (without address resolution for the destination). * Since both icmp6_error and icmp6_redirect_output fill the embedded * link identifiers, we can do this stuff after making a copy for * returning an error. */ if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { /* * See corresponding comments in ip6_output. * XXX: but is it possible that ip6_forward() sends a packet * to a loopback interface? I don't think so, and thus * I bark here. (jinmei@kame.net) * XXX: it is common to route invalid packets to loopback. * also, the codepath will be visited on use of ::1 in * rthdr. (itojun) */ #if 1 if (0) #else if ((rt->rt_flags & (RTF_BLACKHOLE|RTF_REJECT)) == 0) #endif { printf("ip6_forward: outgoing interface is loopback. " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } /* we can just use rcvif in forwarding. */ origifp = m->m_pkthdr.rcvif; } else origifp = rt->rt_ifp; /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* Jump over all PFIL processing if hooks are not active. */ - if (inet6_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet6_pfil_hook)) goto pass; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&inet6_pfil_hook, &m, rt->rt_ifp, PFIL_OUT, NULL); if (error != 0) goto senderr; if (m == NULL) goto freecopy; ip6 = mtod(m, struct ip6_hdr *); pass: error = nd6_output(rt->rt_ifp, origifp, m, dst, rt); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); ip6stat.ip6s_cantforward++; } else { ip6stat.ip6s_forward++; in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward); if (type) ip6stat.ip6s_redirectsent++; else { if (mcopy) goto freecopy; } } senderr: if (mcopy == NULL) return; switch (error) { case 0: if (type == ND_REDIRECT) { icmp6_redirect_output(mcopy, rt); return; } goto freecopy; case EMSGSIZE: /* xxx MTU is constant in PPP? */ goto freecopy; case ENOBUFS: /* Tell source to slow down like source quench in IP? */ goto freecopy; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP6_DST_UNREACH; code = ICMP6_DST_UNREACH_ADDR; break; } icmp6_error(mcopy, type, code, 0); return; freecopy: m_freem(mcopy); return; } diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index 3b345570c78c..a7f5a7c73ced 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -1,1610 +1,1610 @@ /* $FreeBSD$ */ /* $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #endif #ifdef FAST_IPSEC #include #include #define IPSEC #endif /* FAST_IPSEC */ #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; static struct ifqueue ip6intrq; static int ip6qmaxlen = IFQ_MAXLEN; struct in6_ifaddr *in6_ifaddr; extern struct callout in6_tmpaddrtimer_ch; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ int ip6_sourcecheck_interval; /* XXX */ int ip6_ours_check_algorithm; struct pfil_head inet6_pfil_hook; /* firewall hooks */ ip6_fw_chk_t *ip6_fw_chk_ptr; ip6_fw_ctl_t *ip6_fw_ctl_ptr; int ip6_fw_enable = 1; struct ip6stat ip6stat; static void ip6_init2 __P((void *)); static struct ip6aux *ip6_setdstifaddr __P((struct mbuf *, struct in6_ifaddr *)); static int ip6_hopopts_input __P((u_int32_t *, u_int32_t *, struct mbuf **, int *)); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr __P((struct mbuf *, size_t, int)); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init() { struct ip6protosw *pr; int i; #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); #endif pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip6_init"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = (struct ip6protosw *)inet6domain.dom_protosw; pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } /* Initialize packet filter hooks. */ inet6_pfil_hook.ph_type = PFIL_TYPE_AF; inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); ip6intrq.ifq_maxlen = ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; } static void ip6_init2(dummy) void *dummy; { /* nd6_timer_init */ callout_init(&nd6_timer_ch, 0); callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); /* timer for regeneranation of temporary addresses randomize ID */ callout_init(&in6_tmpaddrtimer_ch, 0); callout_reset(&in6_tmpaddrtimer_ch, (ip6_temp_preferred_lifetime - ip6_desync_factor - ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); extern struct route_in6 ip6_forward_rt; void ip6_input(m) struct mbuf *m; { struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; u_int32_t plen; u_int32_t rtalert = ~0; int nxt, ours = 0; struct ifnet *deliverifp = NULL; struct in6_addr odst; int srcrt = 0; GIANT_REQUIRED; /* XXX for now */ #ifdef IPSEC /* * should the inner packet be considered authentic? * see comment in ah4_input(). */ if (m) { m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; } #endif /* * make sure we don't have onion peering information into m_tag. */ ip6_delaux(m); /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) ip6stat.ip6s_mext2m++; else ip6stat.ip6s_mext1++; } else { #define M2MMAX (sizeof(ip6stat.ip6s_m2m)/sizeof(ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { ip6stat.ip6s_m2m[loif[0].if_index]++; /* XXX */ } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else ip6stat.ip6s_m2m[0]++; } else ip6stat.ip6s_m1++; #undef M2MMAX } /* drop the packet if IPv6 operation is disabled on the IF */ if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return; } in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); ip6stat.ip6s_total++; #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n) M_MOVE_PKTHDR(n, m); if (n && n->m_pkthdr.len > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (n == NULL) { m_freem(m); return; /* ENOBUFS */ } m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { ip6stat.ip6s_badvers++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ odst = ip6->ip6_dst; /* Jump over all PFIL processing if hooks are not active. */ - if (inet6_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet6_pfil_hook)) goto passin; if (pfil_run_hooks(&inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); passin: /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; /* If ipfw says divert, we have to just drop packet */ /* use port as a dummy argument */ if ((*ip6_fw_chk_ptr)(&ip6, NULL, &port, &m)) { m_freem(m); m = NULL; } if (!m) return; } /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { ip6stat.ip6s_badscope++; goto bad; } /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *in6m = 0; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast); /* * See if we belong to the destination multicast group on the * arrival interface. */ IN6_LOOKUP_MULTI(ip6->ip6_dst, m->m_pkthdr.rcvif, in6m); if (in6m) ours = 1; else if (!ip6_mrouter) { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } deliverifp = m->m_pkthdr.rcvif; goto hbhcheck; } /* * Unicast check */ if (ip6_forward_rt.ro_rt != NULL && (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr)) ip6stat.ip6s_forward_cachehit++; else { struct sockaddr_in6 *dst6; if (ip6_forward_rt.ro_rt) { /* route is down or destination is different */ ip6stat.ip6s_forward_cachemiss++; RTFREE(ip6_forward_rt.ro_rt); ip6_forward_rt.ro_rt = 0; } bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; rtalloc((struct route *)&ip6_forward_rt); } #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) /* * Accept the packet if the forwarding interface to the destination * according to the routing table is the loopback interface, * unless the associated route has a gateway. * Note that this approach causes to accept a packet if there is a * route to the loopback interface for the destination of the packet. * But we think it's even useful in some situations, e.g. when using * a special daemon which wants to intercept the packet. * * XXX: some OSes automatically make a cloned route for the destination * of an outgoing packet. If the outgoing interface of the packet * is a loopback one, the kernel would consider the packet to be * accepted, even if we have no such address assinged on the interface. * We check the cloned flag of the route entry to reject such cases, * assuming that route entries for our own addresses are not made by * cloning (it should be true because in6_addloop explicitly installs * the host route). However, we might have to do an explicit check * while it would be less efficient. Or, should we rather install a * reject route for such a case? */ if (ip6_forward_rt.ro_rt && (ip6_forward_rt.ro_rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && #ifdef RTF_WASCLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && #endif #ifdef RTF_CLONED !(ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && #endif #if 0 /* * The check below is redundant since the comparison of * the destination and the key of the rtentry has * already done through looking up the routing table. */ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) #endif ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_tag. */ (void)ip6_setdstifaddr(m, ia6); /* * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. */ if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ ia6->ia_ifa.if_ipackets++; ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; goto hbhcheck; } else { /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst))); goto bad; } } /* * FAITH (Firewall Aided Internet Translator) */ if (ip6_keepfaith) { if (ip6_forward_rt.ro_rt && ip6_forward_rt.ro_rt->rt_ifp && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? */ ours = 1; deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!ip6_forwarding) { ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } hbhcheck: /* * record address information into m_tag, if we don't have one yet. * note that we are unable to record it, if the address is not listed * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ if (deliverifp && !ip6_getdstifaddr(m)) { struct in6_ifaddr *ia6; ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { /* * XXX maybe we should drop the packet here, * as we could not provide enough information * to the upper layers. */ } } } /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif return; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); return; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return; } #endif nxt = hbh->ip6h_nxt; /* * accept the packet if a router alert option is included * and we act as an IPv6 router. */ if (rtalert != ~0 && ip6_forwarding) ours = 1; } else nxt = ip6->ip6_nxt; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; m_freem(m); return; } if (!ours) { m_freem(m); return; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 && ipsec6_in_reject(m, NULL)) { ipsec6stat.in_polvio++; goto bad; } #endif nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } /* * set/grab in6_ifaddr correspond to IPv6 destination address. * XXX backward compatibility wrapper */ static struct ip6aux * ip6_setdstifaddr(m, ia6) struct mbuf *m; struct in6_ifaddr *ia6; { struct ip6aux *ip6a; ip6a = ip6_addaux(m); if (ip6a) ip6a->ip6a_dstia6 = ia6; return ip6a; /* NULL if failed to set */ } struct in6_ifaddr * ip6_getdstifaddr(m) struct mbuf *m; { struct ip6aux *ip6a; ip6a = ip6_findaux(m); if (ip6a) return ip6a->ip6a_dstia6; else return NULL; } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. */ static int ip6_hopopts_input(plenp, rtalertp, mp, offp) u_int32_t *plenp; u_int32_t *rtalertp; /* XXX: should be stored more smart way */ struct mbuf **mp; int *offp; { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; u_int8_t *opt; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { ip6stat.ip6s_tooshort++; return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in continuous memory region. */ int ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) struct mbuf *m; u_int8_t *opthead; int hbhlen; u_int32_t *rtalertp; u_int32_t *plenp; { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { ip6stat.ip6s_toosmall++; goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not continuous in order to return an ICMPv6 error. */ int ip6_unknown_opt(optp, m, off) u_int8_t *optp; struct mbuf *m; int off; { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * The function will not modify mbuf chain at all. * * with KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. */ void ip6_savecontrol(in6p, m, mp) struct inpcb *in6p; struct mbuf *m, **mp; { #define IS2292(x, y) ((in6p->in6p_flags & IN6P_RFC2292) ? (x) : (y)) struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } #endif if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) return; /* RFC 2292 sec. 5 */ if ((in6p->in6p_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((in6p->in6p_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((in6p->in6p_flags & IN6P_TCLASS) != 0) { u_int32_t flowinfo; int tclass; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(tclass), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->in6p_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!in6p->in6p_flags & IN6P_RTHDR) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } #undef IS2292 } void ip6_notify_pmtu(in6p, dst, mtu) struct inpcb *in6p; struct sockaddr_in6 *dst; u_int32_t *mtu; { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; so = in6p->inp_socket; if (mtu == NULL) return; #ifdef DIAGNOSTIC if (so == NULL) /* I believe this is impossible */ panic("ip6_notify_pmtu: socket is NULL"); #endif bzero(&mtuctl, sizeof(mtuctl)); /* zero-clear for safety */ mtuctl.ip6m_mtu = *mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); return; } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(m, off, nxt) struct mbuf *m; size_t off; int nxt; { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; MGET(n, M_DONTWAIT, MT_DATA); if (n && elen >= MLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) return NULL; n->m_len = 0; if (elen >= M_TRAILINGSPACE(n)) { m_free(n); return NULL; } m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. * XXX: This function supposes that * M includes all headers, * the next header field and the header length field of each header * are valid, and * the sum of each header length equals to OFF. * Because of these assumptions, this function must be called very * carefully. Moreover, it will not be used in the near future when * we develop `neater' mechanism to process extension headers. */ char * ip6_get_prevhdr(m, off) struct mbuf *m; int off; { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if (off == sizeof(struct ip6_hdr)) return (&ip6->ip6_nxt); else { int len, nxt; struct ip6_ext *ip6e = NULL; nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); while (len < off) { ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len); switch (nxt) { case IPPROTO_FRAGMENT: len += sizeof(struct ip6_frag); break; case IPPROTO_AH: len += (ip6e->ip6e_len + 2) << 2; break; default: len += (ip6e->ip6e_len + 1) << 3; break; } nxt = ip6e->ip6e_nxt; } if (ip6e) return (&ip6e->ip6e_nxt); else return NULL; } } /* * get next header offset. m will be retained. */ int ip6_nexthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } return -1; } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(m, off, proto, nxtp) struct mbuf *m; int off; int proto; int *nxtp; { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } struct ip6aux * ip6_addaux(m) struct mbuf *m; { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); if (!mtag) { mtag = m_tag_get(PACKET_TAG_IPV6_INPUT, sizeof(struct ip6aux), M_NOWAIT); if (mtag) { m_tag_prepend(m, mtag); bzero(mtag + 1, sizeof(struct ip6aux)); } } return mtag ? (struct ip6aux *)(mtag + 1) : NULL; } struct ip6aux * ip6_findaux(m) struct mbuf *m; { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); return mtag ? (struct ip6aux *)(mtag + 1) : NULL; } void ip6_delaux(m) struct mbuf *m; { struct m_tag *mtag; mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL); if (mtag) m_tag_delete(m, mtag); } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT }; diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 4eb7a6ed368c..57999f86d1ef 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,3500 +1,3500 @@ /* $FreeBSD$ */ /* $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include "opt_ip6fw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /* IPSEC */ #ifdef FAST_IPSEC #include #include #include #endif /* FAST_IPSEC */ #include #include #include #include static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "internet multicast options"); struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopt __P((int, u_char *, int, struct ip6_pktopts **, int, int)); static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *)); static int ip6_getpcbopt __P((struct ip6_pktopts *, int, struct sockopt *)); static int ip6_setpktopt __P((int, u_char *, int, struct ip6_pktopts *, int, int, int, int)); static int ip6_setmoptions __P((int, struct ip6_moptions **, struct mbuf *)); static int ip6_getmoptions __P((int, struct ip6_moptions *, struct mbuf **)); static int ip6_copyexthdr __P((struct mbuf **, caddr_t, int)); static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int, struct ip6_frag **)); static int ip6_insert_jumboopt __P((struct ip6_exthdrs *, u_int32_t)); static int ip6_splithdr __P((struct mbuf *, struct ip6_exthdrs *)); static int ip6_getpmtu __P((struct route_in6 *, struct route_in6 *, struct ifnet *, struct in6_addr *, u_long *, int *)); static int copypktopts __P((struct ip6_pktopts *, struct ip6_pktopts *, int)); /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * * type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_rmx.rmx_mtu. */ int ip6_output(m0, opt, ro, flags, im6o, ifpp, inp) struct mbuf *m0; struct ip6_pktopts *opt; struct route_in6 *ro; int flags; struct ip6_moptions *im6o; struct ifnet **ifpp; /* XXX: just for statistics */ struct inpcb *inp; { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; int hlen, tlen, len, off; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst, src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; #if defined(IPSEC) || defined(FAST_IPSEC) int needipsectun = 0; struct secpolicy *sp = NULL; #endif /*IPSEC || FAST_IPSEC*/ ip6 = mtod(m, struct ip6_hdr *); finaldst = ip6->ip6_dst; #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sence with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* get a security policy for this packet */ if (inp == NULL) sp = ipsec6_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); else sp = ipsec6_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error); if (sp == NULL) { ipsec6stat.out_inval++; goto freehdrs; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsec6stat.out_polvio++; goto freehdrs; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ needipsec = 0; break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto freehdrs; } needipsec = 1; break; case IPSEC_POLICY_ENTRUST: default: printf("ip6_output: Invalid policy found. %d\n", sp->policy); } #endif /* IPSEC */ #ifdef FAST_IPSEC /* get a security policy for this packet */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 0, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_OUTBOUND, inp, &error); if (sp == NULL) { newipsecstat.ips_out_inval++; goto freehdrs; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ newipsecstat.ips_out_polvio++; goto freehdrs; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ needipsec = 0; break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto freehdrs; } needipsec = 1; break; case IPSEC_POLICY_ENTRUST: default: printf("ip6_output: Invalid policy found. %d\n", sp->policy); } #endif /* FAST_IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here. do that later. */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If we need IPsec, or there is at least one extension header, * separate IP6 header from the payload. */ if ((needipsec || optlen) && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ { u_char *nexthdrp = &ip6->ip6_nxt; struct mbuf *mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); #if defined(IPSEC) || defined(FAST_IPSEC) if (!needipsec) goto skip_ipsec2; /* * pointers after IPsec headers are not valid any more. * other pointers need a great care too. * (IPsec routines should not mangle mbufs prior to AH/ESP) */ exthdrs.ip6e_dest2 = NULL; { struct ip6_rthdr *rh = NULL; int segleft_org = 0; struct ipsec_output_state state; if (exthdrs.ip6e_rthdr) { rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *); segleft_org = rh->ip6r_segleft; rh->ip6r_segleft = 0; } bzero(&state, sizeof(state)); state.m = m; error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, &needipsectun); m = state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* FALLTHROUGH */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } if (exthdrs.ip6e_rthdr) { /* ah6_output doesn't modify mbuf chain */ rh->ip6r_segleft = segleft_org; } } skip_ipsec2:; #endif } /* * If there is a routing header, replace the destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { struct ip6_rthdr *rh = (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); struct ip6_rthdr0 *rh0; struct in6_addr *addr; struct sockaddr_in6 sa; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; addr = (struct in6_addr *)(rh0 + 1); /* * construct a sockaddr_in6 form of * the first hop. * * XXX: we may not have enough * information about its scope zone; * there is no standard API to pass * the information from the * application. */ bzero(&sa, sizeof(sa)); sa.sin6_family = AF_INET6; sa.sin6_len = sizeof(sa); sa.sin6_addr = addr[0]; if ((error = sa6_embedscope(&sa, ip6_use_defzone)) != 0) { goto bad; } ip6->ip6_dst = sa.sin6_addr; bcopy(&addr[1], &addr[0], sizeof(struct in6_addr) * (rh0->ip6r0_segleft - 1)); addr[rh0->ip6r0_segleft - 1] = finaldst; /* XXX */ in6_clearscope(addr + rh0->ip6r0_segleft - 1); break; default: /* is it possible? */ error = EINVAL; goto bad; } } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; } ip6stat.ip6s_localout++; /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = ip6_defmcasthlim; } #if defined(IPSEC) || defined(FAST_IPSEC) if (needipsec && needipsectun) { struct ipsec_output_state state; /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ bzero(&exthdrs, sizeof(exthdrs)); exthdrs.ip6e_ip6 = m; bzero(&state, sizeof(state)); state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; error = ipsec6_output_tunnel(&state, sp, flags); m = state.m; ro = (struct route_in6 *)state.ro; dst = (struct sockaddr_in6 *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec6_output_tunnel. */ m0 = m = NULL; m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* FALLTHROUGH */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } exthdrs.ip6e_ip6 = m; } #endif /* IPSEC */ /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if ((error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp, &rt, 0)) != 0) { switch (error) { case EHOSTUNREACH: ip6stat.ip6s_noroute++; break; case EADDRNOTAVAIL: default: break; /* XXX statistics? */ } if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); rt->rt_use++; } /* * The outgoing interface must be in the zone of source and * destination addresses. We should use ia_ifp to support the * case of sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) origifp = ia->ia_ifp; else origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* scope check is done. */ goto routefound; badscope: ip6stat.ip6s_badscope++; in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { struct in6_multi *in6m; m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it is not always the case, since * some versions of MGETHDR() does not * initialize the field. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, &alwaysfrag)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * Check with the firewall... */ if (ip6_fw_enable && ip6_fw_chk_ptr) { u_short port = 0; m->m_pkthdr.rcvif = NULL; /* XXX */ /* If ipfw says divert, we have to just drop packet */ if ((*ip6_fw_chk_ptr)(&ip6, ifp, &port, &m)) { m_freem(m); goto done; } if (!m) { error = EACCES; goto done; } } /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not continuous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * continuous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ - if (inet6_pfil_hook.ph_busy_count == -1) + if (!PFIL_HOOKED(&inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localaddr(&ip6->ip6_dst)) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IPV6, m); goto done; } else goto again; /* Redo the routing table lookup. */ } /* XXX: IPFIREWALL_FORWARD */ passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ tlen = m->m_pkthdr.len; if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp)) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data as * well as returning an error code (the latter is not described * in the API spec.) */ u_int32_t mtu32; struct ip6ctlparam ip6cp; mtu32 = (u_int32_t)mtu; bzero(&ip6cp, sizeof(ip6cp)); ip6cp.ip6c_cmdarg = (void *)&mtu32; pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst, (void *)&ip6cp); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ ia6->ia_ifa.if_opackets++; ia6->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; struct ip6_frag *ip6f; u_int32_t id = htonl(ip6_randomid()); u_char nextproto; #if 0 struct ip6ctlparam ip6cp; u_int32_t mtu32; #endif int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; #if 0 /* * It is believed this code is a leftover from the * development of the IPV6_RECVPATHMTU sockopt and * associated work to implement RFC3542. * It's not entirely clear what the intent of the API * is at this point, so disable this code for now. * The IPV6_RECVPATHMTU sockopt and/or IPV6_DONTFRAG * will send notifications if the application requests. */ /* Notify a proper path MTU to applications. */ mtu32 = (u_int32_t)mtu; bzero(&ip6cp, sizeof(ip6cp)); ip6cp.ip6c_cmdarg = (void *)&mtu32; pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst, (void *)&ip6cp); #endif len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * Verify that we have any chance at all of being able to queue * the packet or packet fragments */ if (qslots <= 0 || ((u_int)qslots * (mtu - hlen) < tlen /* - hlen */)) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto bad; } mnext = &m->m_nextpkt; /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; for (off = hlen; off < tlen; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); if (!m) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m->m_pkthdr.rcvif = NULL; m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { ip6stat.ip6s_odropped++; goto sendorfree; } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, len)) == 0) { error = ENOBUFS; ip6stat.ip6s_odropped++; goto sendorfree; } m_cat(m, m_frgpart); m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; ip6stat.ip6s_ofragments++; in6_ifstat_inc(ifp, ifs6_out_fragcreat); } in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ip6stat.ip6s_fragmented++; done: if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */ RTFREE(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { RTFREE(ro_pmtu->ro_rt); } #ifdef IPSEC if (sp != NULL) key_freesp(sp); #endif /* IPSEC */ #ifdef FAST_IPSEC if (sp != NULL) KEY_FREESP(&sp); #endif /* FAST_IPSEC */ return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: m_freem(m); goto done; } static int ip6_copyexthdr(mp, hdr, hlen) struct mbuf **mp; caddr_t hdr; int hlen; { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ MGET(m, M_DONTWAIT, MT_DATA); if (!m) return (ENOBUFS); if (hlen > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(exthdrs, plen) struct ip6_exthdrs *exthdrs; u_int32_t plen; { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ MGET(n, M_DONTWAIT, MT_DATA); if (n) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(m0, m, hlen, frghdrp) struct mbuf *m0, *m; int hlen; struct ip6_frag **frghdrp; { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == 0) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == 0) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } static int ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp) struct route_in6 *ro_pmtu, *ro; struct ifnet *ifp; struct in6_addr *dst; u_long *mtup; int *alwaysfragp; { u_int32_t mtu = 0; int alwaysfrag = 0; int error = 0; if (ro_pmtu != ro) { /* The first hop and the final destination may differ. */ struct sockaddr_in6 *sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (ro_pmtu->ro_rt && ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) { RTFREE(ro_pmtu->ro_rt); ro_pmtu->ro_rt = (struct rtentry *)NULL; } if (ro_pmtu->ro_rt == NULL) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; rtalloc((struct route *)ro_pmtu); } } if (ro_pmtu->ro_rt) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags = 1; /* IPv6 */ inc.inc6_faddr = *dst; if (ifp == NULL) ifp = ro_pmtu->ro_rt->rt_ifp; ifmtu = IN6_LINKMTU(ifp); mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu); else mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } else if (mtu > ifmtu) { /* * The MTU on the route is larger than the MTU on * the interface! This shouldn't happen, unless the * MTU of the interface has been changed after the * interface was brought up. Change the MTU in the * route to match the interface MTU (as long as the * field isn't locked). */ mtu = ifmtu; ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int privileged, optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; } else { panic("ip6_ctloutput: arg soopt is NULL"); } error = optval = 0; privileged = (td == 0 || suser(td)) ? 0 : 1; uproto = (int)so->so_proto->pr_protocol; if (level == IPPROTO_IPV6) { switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (!privileged) { error = EPERM; break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_FAITH: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->in6p_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ if (optval) \ in6p->in6p_flags |= (bit); \ else \ in6p->in6p_flags &= ~(bit); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ in6p->in6p_flags |= IN6P_RFC2292; \ if (optval) \ in6p->in6p_flags |= (bit); \ else \ in6p->in6p_flags &= ~(bit); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, privileged, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_FAITH: OPTSET(IN6P_FAITH); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->in6p_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->in6p_vflag &= ~INP_IPV4; else in6p->in6p_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, privileged, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (!privileged) return (EPERM); OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (!privileged) return (EPERM); OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, privileged, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } /* XXX */ } /* FALLTHROUGH */ { struct mbuf *m; if (sopt->sopt_valsize > MCLBYTES) { error = EMSGSIZE; break; } /* XXX */ MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); if (m == 0) { error = ENOBUFS; break; } if (sopt->sopt_valsize > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); error = ENOBUFS; break; } } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { (void)m_free(m); break; } error = ip6_setmoptions(sopt->sopt_name, &in6p->in6p_moptions, m); (void)m_free(m); } break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags &= ~(IN6P_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->in6p_flags &= ~(IN6P_LOWPORT); in6p->in6p_flags |= IN6P_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->in6p_flags &= ~(IN6P_HIGHPORT); in6p->in6p_flags |= IN6P_LOWPORT; break; default: error = EINVAL; break; } break; #if defined(IPSEC) || defined(FAST_IPSEC) case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec6_set_policy(in6p, optname, req, len, privileged); m_freem(m); } break; #endif /* KAME IPSEC */ case IPV6_FW_ADD: case IPV6_FW_DEL: case IPV6_FW_FLUSH: case IPV6_FW_ZERO: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) return EINVAL; /* XXX */ if ((error = soopt_getm(sopt, &m)) != 0) break; /* XXX */ if ((error = soopt_mcopyin(sopt, m)) != 0) break; error = (*ip6_fw_ctl_ptr)(optname, mp); m = *mp; } break; default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_FAITH: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_FAITH: optval = OPTBIT(IN6P_FAITH); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->in6p_flags; if (flags & IN6P_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & IN6P_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct route_in6 sro; bzero(&sro, sizeof(sro)); if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu(&sro, NULL, NULL, &in6p->in6p_faddr, &pmtu, NULL); if (sro.ro_rt) RTFREE(sro.ro_rt); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: { struct mbuf *m; error = ip6_getmoptions(sopt->sopt_name, in6p->in6p_moptions, &m); if (error == 0) error = sooptcopyout(sopt, mtod(m, char *), m->m_len); m_freem(m); } break; #if defined(IPSEC) || defined(FAST_IPSEC) case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec6_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* KAME IPSEC */ case IPV6_FW_GET: { struct mbuf *m; struct mbuf **mp = &m; if (ip6_fw_ctl_ptr == NULL) { return EINVAL; } error = (*ip6_fw_ctl_ptr)(optname, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); } break; default: error = ENOPROTOOPT; break; } break; } } else { /* level != IPPROTO_IPV6 */ error = EINVAL; } return (error); } int ip6_raw_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct in6pcb *in6p = sotoin6pcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else panic("ip6_raw_ctloutput: arg soopt is NULL"); if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(pktopt, m, so, sopt) struct ip6_pktopts **pktopt; struct mbuf *m; struct socket *so; struct sockopt *sopt; { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; int priv = 0; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if (td && !suser(td)) priv = 1; if ((error = ip6_setpktopts(m, opt, NULL, priv, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(opt) struct ip6_pktopts *opt; { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(optname, buf, len, pktopt, priv, uproto) int optname, len, priv; u_char *buf; struct ip6_pktopts **pktopt; int uproto; { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, priv, 1, 0, uproto)); } static int ip6_getpcbopt(pktopt, optname, sopt) struct ip6_pktopts *pktopt; struct sockopt *sopt; int optname; { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) optdata = (void *)pktopt->ip6po_pktinfo; else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); optdata = (void *)&null_pktinfo; } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(pktopt, optname) struct ip6_pktopts *pktopt; int optname; { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(dst, src, canwait) struct ip6_pktopts *dst, *src; int canwait; { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL && canwait == M_NOWAIT) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT); if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT); if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT); if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT); if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT); if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(src, canwait) struct ip6_pktopts *src; int canwait; { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL && canwait == M_NOWAIT) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(pktopt) struct ip6_pktopts *pktopt; { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set the IP6 multicast options in response to user setsockopt(). */ static int ip6_setmoptions(optname, im6op, m) int optname; struct ip6_moptions **im6op; struct mbuf *m; { int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; struct ifnet *ifp; struct ip6_moptions *im6o = *im6op; struct route_in6 ro; struct in6_multi_mship *imm; struct thread *td = curthread; if (im6o == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ im6o = (struct ip6_moptions *) malloc(sizeof(*im6o), M_IP6MOPTS, M_WAITOK); if (im6o == NULL) return (ENOBUFS); *im6op = im6o; im6o->im6o_multicast_ifp = NULL; im6o->im6o_multicast_hlim = ip6_defmcasthlim; im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } switch (optname) { case IPV6_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); if (ifindex < 0 || if_index < ifindex) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } im6o->im6o_multicast_ifp = ifp; break; case IPV6_MULTICAST_HOPS: { /* * Set the IP6 hoplimit for outgoing multicast packets. */ int optval; if (m == NULL || m->m_len != sizeof(int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &optval, sizeof(optval)); if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) im6o->im6o_multicast_hlim = ip6_defmcasthlim; else im6o->im6o_multicast_hlim = optval; break; } case IPV6_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (m == NULL || m->m_len != sizeof(u_int)) { error = EINVAL; break; } bcopy(mtod(m, u_int *), &loop, sizeof(loop)); if (loop > 1) { error = EINVAL; break; } im6o->im6o_multicast_loop = loop; break; case IPV6_JOIN_GROUP: /* * Add a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { /* * We use the unspecified address to specify to accept * all multicast addresses. Only super user is allowed * to do this. */ if (suser(td)) { error = EACCES; break; } } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { error = EINVAL; break; } /* * If no interface was explicitly specified, choose an * appropriate one according to the given multicast address. */ if (mreq->ipv6mr_interface == 0) { struct sockaddr_in6 *dst; /* * Look up the routing table for the * address, and choose the outgoing interface. * XXX: is it a good approach? */ ro.ro_rt = NULL; dst = (struct sockaddr_in6 *)&ro.ro_dst; bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = mreq->ipv6mr_multiaddr; rtalloc((struct route *)&ro); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; break; } ifp = ro.ro_rt->rt_ifp; RTFREE(ro.ro_rt); } else { /* * If the interface is specified, validate it. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } ifp = ifnet_byindex(mreq->ipv6mr_interface); if (!ifp) { error = ENXIO; /* XXX EINVAL? */ break; } } /* * See if we found an interface, and confirm that it * supports multicast */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; break; } if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) { error = EADDRNOTAVAIL; /* XXX: should not happen */ break; } /* * See if the membership already exists. */ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) if (imm->i6mm_maddr->in6m_ifp == ifp && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; if (imm != NULL) { error = EADDRINUSE; break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ imm = in6_joingroup(ifp, &mreq->ipv6mr_multiaddr, &error, 0); if (imm == NULL) break; LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); break; case IPV6_LEAVE_GROUP: /* * Drop a multicast group membership. * Group must be a valid IP6 multicast address. */ if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { error = EINVAL; break; } mreq = mtod(m, struct ipv6_mreq *); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq->ipv6mr_interface < 0 || if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } if (mreq->ipv6mr_interface == 0) ifp = NULL; else ifp = ifnet_byindex(mreq->ipv6mr_interface); /* Fill in the scope zone ID */ if (ifp) { if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) { /* XXX: should not happen */ error = EADDRNOTAVAIL; break; } } else if (mreq->ipv6mr_interface != 0) { /* * This case happens when the (positive) index is in * the valid range, but the corresponding interface has * been detached dynamically (XXX). */ error = EADDRNOTAVAIL; break; } else { /* ipv6mr_interface == 0 */ struct sockaddr_in6 sa6_mc; /* * The API spec says as follows: * If the interface index is specified as 0, the * system may choose a multicast group membership to * drop by matching the multicast address only. * On the other hand, we cannot disambiguate the scope * zone unless an interface is provided. Thus, we * check if there's ambiguity with the default scope * zone as the last resort. */ bzero(&sa6_mc, sizeof(sa6_mc)); sa6_mc.sin6_family = AF_INET6; sa6_mc.sin6_len = sizeof(sa6_mc); sa6_mc.sin6_addr = mreq->ipv6mr_multiaddr; error = sa6_embedscope(&sa6_mc, ip6_use_defzone); if (error != 0) break; mreq->ipv6mr_multiaddr = sa6_mc.sin6_addr; } /* * Find the membership in the membership list. */ for (imm = im6o->im6o_memberships.lh_first; imm != NULL; imm = imm->i6mm_chain.le_next) { if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) && IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, &mreq->ipv6mr_multiaddr)) break; } if (imm == NULL) { /* Unable to resolve interface */ error = EADDRNOTAVAIL; break; } /* * Give up the multicast address record to which the * membership points. */ LIST_REMOVE(imm, i6mm_chain); in6_delmulti(imm->i6mm_maddr); free(imm, M_IP6MADDR); break; default: error = EOPNOTSUPP; break; } /* * If all options have default values, no need to keep the mbuf. */ if (im6o->im6o_multicast_ifp == NULL && im6o->im6o_multicast_hlim == ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && im6o->im6o_memberships.lh_first == NULL) { free(*im6op, M_IP6MOPTS); *im6op = NULL; } return (error); } /* * Return the IP6 multicast options in response to user getsockopt(). */ static int ip6_getmoptions(optname, im6o, mp) int optname; struct ip6_moptions *im6o; struct mbuf **mp; { u_int *hlim, *loop, *ifindex; *mp = m_get(M_TRYWAIT, MT_HEADER); /* XXX */ switch (optname) { case IPV6_MULTICAST_IF: ifindex = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) *ifindex = 0; else *ifindex = im6o->im6o_multicast_ifp->if_index; return (0); case IPV6_MULTICAST_HOPS: hlim = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *hlim = ip6_defmcasthlim; else *hlim = im6o->im6o_multicast_hlim; return (0); case IPV6_MULTICAST_LOOP: loop = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) *loop = ip6_defmcasthlim; else *loop = im6o->im6o_multicast_loop; return (0); default: return (EOPNOTSUPP); } } /* * Discard the IP6 multicast options. */ void ip6_freemoptions(im6o) struct ip6_moptions *im6o; { struct in6_multi_mship *imm; if (im6o == NULL) return; while ((imm = im6o->im6o_memberships.lh_first) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr) in6_delmulti(imm->i6mm_maddr); free(imm, M_IP6MADDR); } free(im6o, M_IP6MOPTS); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(control, opt, stickyopt, priv, uproto) struct mbuf *control; struct ip6_pktopts *opt, *stickyopt; int priv, uproto; { struct cmsghdr *cm = 0; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, priv, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(optname, buf, len, opt, priv, sticky, cmsg, uproto) int optname, len, priv, sticky, cmsg, uproto; u_char *buf; struct ip6_pktopts *opt; { int minmtupolicy, preftemp; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > if_index || pktinfo->ipi6_ifindex < 0) { return (ENXIO); } if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (!priv) return (EPERM); if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (!priv) return (EPERM); if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (!priv) /* XXX: see the comment for IPV6_HOPOPTS */ return (EPERM); if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(ifp, m, dst) struct ifnet *ifp; struct mbuf *m; struct sockaddr_in6 *dst; { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if ((copym->m_flags & M_EXT) != 0 || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); (void)if_simloop(ifp, copym, dst->sin6_family, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(m, exthdrs) struct mbuf *m; struct ip6_exthdrs *exthdrs; { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { MGETHDR(mh, M_DONTWAIT, MT_HEADER); if (mh == 0) { m_freem(m); return ENOBUFS; } M_MOVE_PKTHDR(mh, m); MH_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(in6p) struct in6pcb *in6p; { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen }