Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 280954) +++ head/sys/netinet6/ip6_output.c (revision 280955) @@ -1,2973 +1,2974 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
 */

/*
 * NOTE(review): the bare #include directives below have no header name in
 * this copy of the file; the names appear to have been lost when the text
 * was extracted.  They are kept exactly as found -- restore them from the
 * repository before attempting to build.
 */
#include
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_sctp.h"
#include "opt_route.h"
#include "opt_rss.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef IPSEC
#include
#include
#include
#include
#endif /* IPSEC */
#ifdef SCTP
#include
#include
#endif

#include
#include

#ifdef FLOWTABLE
#include
#endif

extern int in6_mcast_loop;

/*
 * The separate mbufs from which ip6_output() assembles the header chain of
 * an outgoing packet.  ip6_output() zeroes the whole structure first, so a
 * NULL member means the corresponding header is not present.
 */
struct ip6_exthdrs {
	struct mbuf *ip6e_ip6;		/* mbuf holding the IPv6 header */
	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
	struct mbuf *ip6e_dest1;	/* Destination options header, 1st part */
	struct mbuf *ip6e_rthdr;	/* Routing header */
	struct mbuf *ip6e_dest2;	/* Destination options header, 2nd part */
};

static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
			   struct ucred *, int);
static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
	struct socket *, struct sockopt *);
static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
	struct ucred *, int, int, int);

static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
	struct ip6_frag **);
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct route_in6 *, struct route_in6 *,
	struct ifnet *, struct in6_addr *, u_long *, int *, u_int);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);


/*
 * Make an extension header from option data. hp is the source, and
 * mp is the destination.
*/
/*
 * The number of bytes copied is taken from the header's own length field:
 * (ip6e_len + 1) << 3.  Nothing is done when hp is NULL.
 *
 * The expansion assigns to a variable named `error' and, when
 * ip6_copyexthdr() fails, jumps to a label named `freehdrs'; both must
 * exist in the function that uses this macro.
 */
#define MAKE_EXTHDR(hp, mp)						\
    do {								\
	if (hp) {							\
		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
		    ((eh)->ip6e_len + 1) << 3);				\
		if (error)						\
			goto freehdrs;					\
	}								\
    } while (/*CONSTCOND*/ 0)

/*
 * Form a chain of extension headers.
 * m is the extension header mbuf
 * mp is the previous mbuf in the chain
 * p is the next header
 * i is the type of option.
 *
 * When m is non-NULL the steps are, in order: the byte at *p is copied
 * into the first byte of m, *p is overwritten with i, p is repointed at
 * the first byte of m, m is linked in directly after mp, and mp is
 * advanced to m.  Both p and mp are assigned to, so the caller must pass
 * modifiable lvalues.  The expansion also reads a variable named
 * `hdrsplit' from the enclosing scope and panics when it is zero.
 */
#define MAKE_CHAIN(m, mp, p, i)\
    do {\
	if (m) {\
		if (!hdrsplit) \
			panic("assumption failed: hdr not split"); \
		*mtod((m), u_char *) = *(p);\
		*(p) = (i);\
		p = mtod((m), u_char *);\
		(m)->m_next = (mp)->m_next;\
		(mp)->m_next = (m);\
		(mp) = (m);\
	}\
    } while (/*CONSTCOND*/ 0)

/*
 * Compute a checksum that was deferred earlier and store it in the packet.
 *
 * m      - the packet.
 * plen   - added to `offset' to form the length handed to in_cksum_skip().
 * offset - handed to in_cksum_skip() as the skip count; m_pkthdr.csum_data
 *          is then added to it to locate the 16-bit checksum field.
 *
 * A computed value of 0 is replaced by 0xffff when CSUM_UDP_IPV6 is set in
 * the packet's csum_flags.  The result is written only if the whole field
 * lies within the first mbuf (m->m_len); otherwise a diagnostic is printed
 * and the packet is left unchanged.
 */
void
in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
{
	u_short csum;

	csum = in_cksum_skip(m, offset + plen, offset);
	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
		csum = 0xffff;
	/* From here on `offset' addresses the checksum field itself. */
	offset += m->m_pkthdr.csum_data;	/* checksum offset */

	if (offset + sizeof(u_short) > m->m_len) {
		printf("%s: delayed m_pullup, m->len: %d plen %u off %u "
		    "csum_flags=%b\n", __func__, m->m_len, plen, offset,
		    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
		/*
		 * XXX this should not happen, but if it does, the correct
		 * behavior may be to insert the checksum in the appropriate
		 * next mbuf in the chain.
		 */
		return;
	}
	*(u_short *)(m->m_data + offset) = csum;
}

/*
 * Build the fragments of packet m0 and queue them, in order, on the
 * m_nextpkt list that starts at m0->m_nextpkt.
 *
 * ifp       - interface whose ifs6_out_fragcreat counter is bumped once per
 *             fragment.
 * m0        - the original packet.  Apart from its m_nextpkt link it is only
 *             read here; it is not freed by this function.
 * hlen      - offset in m0 at which the data to be split starts; also passed
 *             to ip6_insertfraghdr() for every fragment.
 * nextproto - stored in ip6f_nxt of every fragment header.
 * mtu       - number of data bytes copied from m0 into each fragment; the
 *             final fragment takes whatever remains.
 * id        - stored unchanged in ip6f_ident of every fragment.  In this
 *             diff the "-" lines show the old form, where the value was
 *             generated locally with htonl(ip6_randomid()); the "+" line
 *             makes it a parameter.
 *
 * Returns 0 on success, ENOBUFS when m_gethdr() or m_copy() fails, or the
 * error reported by ip6_insertfraghdr().  On failure the fragments created
 * so far (including the one being built) remain linked from m0->m_nextpkt;
 * nothing is freed here.
 */
int
ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
-    int mtu)
+    int mtu, uint32_t id)
{
	struct mbuf *m, **mnext, *m_frgpart;
	struct ip6_hdr *ip6, *mhip6;
	struct ip6_frag *ip6f;
	int off;
	int error;
	int tlen = m0->m_pkthdr.len;
-	uint32_t id = htonl(ip6_randomid());

	m = m0;
	ip6 = mtod(m, struct ip6_hdr *);
	/* mnext always points at the link field the next fragment goes into. */
	mnext = &m->m_nextpkt;

	for (off = hlen; off < tlen; off += mtu) {
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (!m) {
			IP6STAT_INC(ip6s_odropped);
			return (ENOBUFS);
		}
		m->m_flags = m0->m_flags & M_COPYFLAGS;
		/* Link the new mbuf in before filling it, so it stays reachable. */
		*mnext = m;
		mnext = &m->m_nextpkt;
		m->m_data += max_linkhdr;
		/* Start the fragment with a copy of the original IPv6 header. */
		mhip6 = mtod(m, struct ip6_hdr *);
		*mhip6 = *ip6;
		m->m_len = sizeof(*mhip6);
		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
		if (error) {
			IP6STAT_INC(ip6s_odropped);
			return (error);
		}
		/* Offset is relative to hlen, with the low three bits cleared. */
		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
		if (off + mtu >= tlen)
			/* Last fragment: shrink mtu to the bytes that remain. */
			mtu = tlen - off;
		else
			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
		mhip6->ip6_plen = htons((u_short)(mtu + hlen +
		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
		if ((m_frgpart = m_copy(m0, off, mtu)) == 0) {
			IP6STAT_INC(ip6s_odropped);
			return (ENOBUFS);
		}
		m_cat(m, m_frgpart);
		m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f);
		m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum;
		m->m_pkthdr.rcvif = NULL;
		ip6f->ip6f_reserved = 0;
		ip6f->ip6f_ident = id;
		ip6f->ip6f_nxt = nextproto;
		IP6STAT_INC(ip6s_ofragments);
		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
	}

	return (0);
}


/*
 * IP6 output. The packet in mbuf chain m contains a skeletal IP6
 * header (with pri, len, nxt, hlim, src, dst).
 * This function may modify ver and hlim only.
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * If route_in6 ro is present and has ro_rt initialized, route lookup would be
 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
 * then result of route lookup is stored in ro->ro_rt.
* * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst, src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; + uint32_t id; ip6 = mtod(m, struct ip6_hdr *); if (ip6 == NULL) { printf ("ip6 is NULL"); goto bad; } if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. 
*/ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ switch(ip6_ipsec_output(&m, inp, &error)) { case 1: /* Bad packet */ goto freehdrs; case -1: /* IPSec done */ goto done; case 0: /* No IPSec */ default: break; } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. 
Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. 
* we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if (ro->ro_rt && fwd_tag == NULL) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* * The outgoing interface must be in the zone of source and * destination addresses. 
*/ origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* We should use ia_ifp to support the case of * sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; /* scope check is done. */ goto routefound; badscope: IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. 
*/ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. 
*/ if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, &alwaysfrag, fibnum)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. 
*/ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else needfiblookup = 1; /* Redo the routing table lookup. */ } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. 
*/ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. 
*/ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. 
*/ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; - if ((error = ip6_fragment(ifp, m, hlen, nextproto, len))) + id = htonl(ip6_randomid()); + if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. 
*/ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: if (ro == &ip6route) RO_RTFREE(ro); if (ro_pmtu == &ip6route) RO_RTFREE(ro_pmtu); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. 
*/ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == 0) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. 
*/ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } static int ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, struct ifnet *ifp, struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum) { u_int32_t mtu = 0; int alwaysfrag = 0; int error = 0; if (ro_pmtu != ro) { /* The first hop and the final destination may differ. */ struct sockaddr_in6 *sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (ro_pmtu->ro_rt && ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) { RTFREE(ro_pmtu->ro_rt); ro_pmtu->ro_rt = (struct rtentry *)NULL; } if (ro_pmtu->ro_rt == NULL) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_rtalloc(ro_pmtu, fibnum); } } if (ro_pmtu->ro_rt) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (ifp == NULL) ifp = ro_pmtu->ro_rt->rt_ifp; ifmtu = IN6_LINKMTU(ifp); mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, ro_pmtu->ro_rt->rt_mtu); else mtu = ro_pmtu->ro_rt->rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. 
* (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. 
That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 
1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. 
*/ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { in6p->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. 
*/ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? 
td->td_ucred : NULL, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(in6p, optname, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. 
*/ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: #ifdef RSS case IPV6_RSSBUCKETID: #endif switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct route_in6 sro; bzero(&sro, sizeof(sro)); if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case 
of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu(&sro, NULL, NULL, &in6p->in6p_faddr, &pmtu, NULL, so->so_fibnum); if (sro.ro_rt) RTFREE(sro.ro_rt); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* IPSEC */ default: error 
= ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. 
*/ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) optdata = (void *)pktopt->ip6po_pktinfo; else { /* XXX: we don't have to do this every time... 
*/ bzero(&null_pktinfo, sizeof(null_pktinfo)); optdata = (void *)&null_pktinfo; } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if 
(pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, 
canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = 0; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. 
*/ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. 
*/ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && ( ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. 
This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new 
option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. 
* Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 
0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); (void)if_simloop(ifp, copym, dst->sin6_family, 0); } /* * Chop IPv6 header off from the payload. 
*/ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/ip6_var.h =================================================================== --- head/sys/netinet6/ip6_var.h (revision 280954) +++ head/sys/netinet6/ip6_var.h (revision 280955) @@ -1,425 +1,426 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { struct ip6asfrag *ip6q_down; struct ip6asfrag *ip6q_up; u_int32_t ip6q_ident; u_int8_t ip6q_nxt; u_int8_t ip6q_ecn; u_int8_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ #ifdef notyet u_char *ip6q_nxtp; #endif int ip6q_nfrag; /* # of fragments */ struct label *ip6q_label; }; struct ip6asfrag { struct ip6asfrag *ip6af_down; struct ip6asfrag *ip6af_up; struct mbuf *ip6af_m; int ip6af_offset; /* offset in ip6af_m to next header */ int ip6af_frglen; /* fragmentable part length */ int ip6af_off; /* fragment offset */ u_int16_t ip6af_mff; /* more fragment bit in frag off */ }; #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) /* * Structure attached to inpcb.in6p_moptions and * passed to ip6_output when IPv6 multicast options are in use. * This structure is lazy-allocated. 
*/ struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 => hear sends if a member */ u_short im6o_num_memberships; /* no. memberships this socket */ u_short im6o_max_memberships; /* max memberships this socket */ struct in6_multi **im6o_membership; /* group memberships */ struct in6_mfilter *im6o_mfilters; /* source filters */ }; /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route /* Nexthop related info */ struct ip6po_nhinfo { struct sockaddr *ip6po_nhi_nexthop; struct route_in6 ip6po_nhi_route; /* Route to the nexthop */ }; #define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop #define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route struct ip6_pktopts { struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ int ip6po_hlim; /* Hoplimit for outgoing packets */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; /* Next-hop address information */ struct ip6po_nhinfo ip6po_nhinfo; struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ struct ip6_dest *ip6po_dest1; /* Routing header related info.
*/ struct ip6po_rhinfo ip6po_rhinfo; /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; int ip6po_tclass; /* traffic class */ int ip6po_minmtu; /* fragment vs PMTU discovery policy */ #define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/ #define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ #define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ int ip6po_prefer_tempaddr; /* whether temporary addresses are preferred as source address */ #define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ #define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ #define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ int ip6po_flags; #if 0 /* parameters in this block is obsolete. do not reuse the values. */ #define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */ #define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #endif #define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ #define IP6PO_USECOA 0x08 /* use care of address */ }; /* * Control options for incoming packets */ struct ip6stat { uint64_t ip6s_total; /* total packets received */ uint64_t ip6s_tooshort; /* packet too short */ uint64_t ip6s_toosmall; /* not enough data */ uint64_t ip6s_fragments; /* fragments received */ uint64_t ip6s_fragdropped; /* frags dropped(dups, out of space) */ uint64_t ip6s_fragtimeout; /* fragments timed out */ uint64_t ip6s_fragoverflow; /* fragments that exceeded limit */ uint64_t ip6s_forward; /* packets forwarded */ uint64_t ip6s_cantforward; /* packets rcvd for unreachable dest */ uint64_t ip6s_redirectsent; /* packets forwarded on same net */ uint64_t ip6s_delivered; /* datagrams delivered to upper level*/ uint64_t ip6s_localout; /* total ip packets generated here */ uint64_t ip6s_odropped; /* lost packets due to nobufs, etc. 
*/ uint64_t ip6s_reassembled; /* total packets reassembled ok */ uint64_t ip6s_fragmented; /* datagrams successfully fragmented */ uint64_t ip6s_ofragments; /* output fragments created */ uint64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ uint64_t ip6s_badoptions; /* error in option processing */ uint64_t ip6s_noroute; /* packets discarded due to no route */ uint64_t ip6s_badvers; /* ip6 version != 6 */ uint64_t ip6s_rawout; /* total raw ip packets generated */ uint64_t ip6s_badscope; /* scope error */ uint64_t ip6s_notmember; /* don't join this multicast group */ #define IP6S_HDRCNT 256 /* headers count */ uint64_t ip6s_nxthist[IP6S_HDRCNT]; /* next header history */ uint64_t ip6s_m1; /* one mbuf */ #define IP6S_M2MMAX 32 uint64_t ip6s_m2m[IP6S_M2MMAX]; /* two or more mbuf */ uint64_t ip6s_mext1; /* one ext mbuf */ uint64_t ip6s_mext2m; /* two or more ext mbuf */ uint64_t ip6s_exthdrtoolong; /* ext hdr are not contiguous */ uint64_t ip6s_nogif; /* no match gif found */ uint64_t ip6s_toomanyhdr; /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ #define IP6S_RULESMAX 16 #define IP6S_SCOPECNT 16 /* number of times that address selection fails */ uint64_t ip6s_sources_none; /* number of times that an address on the outgoing I/F is chosen */ uint64_t ip6s_sources_sameif[IP6S_SCOPECNT]; /* number of times that an address on a non-outgoing I/F is chosen */ uint64_t ip6s_sources_otherif[IP6S_SCOPECNT]; /* * number of times that an address that has the same scope * from the destination is chosen. */ uint64_t ip6s_sources_samescope[IP6S_SCOPECNT]; /* * number of times that an address that has a different scope * from the destination is chosen. 
*/ uint64_t ip6s_sources_otherscope[IP6S_SCOPECNT]; /* number of times that a deprecated address is chosen */ uint64_t ip6s_sources_deprecated[IP6S_SCOPECNT]; /* number of times that each rule of source selection is applied. */ uint64_t ip6s_sources_rule[IP6S_RULESMAX]; }; #ifdef _KERNEL #include VNET_PCPUSTAT_DECLARE(struct ip6stat, ip6stat); #define IP6STAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, (val)) #define IP6STAT_SUB(name, val) IP6STAT_ADD(name, -(val)) #define IP6STAT_INC(name) IP6STAT_ADD(name, 1) #define IP6STAT_DEC(name) IP6STAT_SUB(name, 1) #endif #ifdef _KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #ifdef __NO_STRICT_ALIGNMENT #define IP6_HDR_ALIGNED_P(ip) 1 #else #define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif VNET_DECLARE(int, ip6_defhlim); /* default hop limit */ VNET_DECLARE(int, ip6_defmcasthlim); /* default multicast hop limit */ VNET_DECLARE(int, ip6_forwarding); /* act as router? */ VNET_DECLARE(int, ip6_use_deprecated); /* allow deprecated addr as source */ VNET_DECLARE(int, ip6_rr_prune); /* router renumbering prefix * walk list every 5 sec. */ VNET_DECLARE(int, ip6_mcast_pmtu); /* enable pMTU discovery for multicast? */ VNET_DECLARE(int, ip6_v6only); #define V_ip6_defhlim VNET(ip6_defhlim) #define V_ip6_defmcasthlim VNET(ip6_defmcasthlim) #define V_ip6_forwarding VNET(ip6_forwarding) #define V_ip6_use_deprecated VNET(ip6_use_deprecated) #define V_ip6_rr_prune VNET(ip6_rr_prune) #define V_ip6_mcast_pmtu VNET(ip6_mcast_pmtu) #define V_ip6_v6only VNET(ip6_v6only) VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */ VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? 
*/ VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly * queue */ VNET_DECLARE(int, ip6_maxfrags); /* Maximum fragments in reassembly * queue */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA * receiving IF. */ VNET_DECLARE(int, ip6_rfc6204w3); /* Accept defroute from RA even when forwarding enabled */ VNET_DECLARE(int, ip6_log_interval); VNET_DECLARE(time_t, ip6_log_time); VNET_DECLARE(int, ip6_hdrnestlimit); /* upper limit of # of extension * headers */ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_mrouter VNET(ip6_mrouter) #define V_ip6_sendredirects VNET(ip6_sendredirects) #define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_ip6_maxfrags VNET(ip6_maxfrags) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) #define V_ip6_no_radr VNET(ip6_no_radr) #define V_ip6_norbit_raif VNET(ip6_norbit_raif) #define V_ip6_rfc6204w3 VNET(ip6_rfc6204w3) #define V_ip6_log_interval VNET(ip6_log_interval) #define V_ip6_log_time VNET(ip6_log_time) #define V_ip6_hdrnestlimit VNET(ip6_hdrnestlimit) #define V_ip6_dad_count VNET(ip6_dad_count) VNET_DECLARE(int, ip6_auto_flowlabel); VNET_DECLARE(int, ip6_auto_linklocal); #define V_ip6_auto_flowlabel VNET(ip6_auto_flowlabel) #define V_ip6_auto_linklocal VNET(ip6_auto_linklocal) VNET_DECLARE(int, ip6_use_tempaddr); /* Whether to use temporary addresses */ VNET_DECLARE(int, ip6_prefer_tempaddr); /* Whether to prefer temporary * addresses in the source address * selection */ #define V_ip6_use_tempaddr VNET(ip6_use_tempaddr) #define V_ip6_prefer_tempaddr VNET(ip6_prefer_tempaddr) VNET_DECLARE(int, ip6_use_defzone); /* Whether to use the default scope * zone when unspecified */ #define V_ip6_use_defzone VNET(ip6_use_defzone) VNET_DECLARE (struct pfil_head, inet6_pfil_hook); /* packet filter hooks */ #define V_inet6_pfil_hook 
VNET(inet6_pfil_hook) #ifdef IPSTEALTH VNET_DECLARE(int, ip6stealth); #define V_ip6stealth VNET(ip6stealth) #endif extern struct pr_usrreqs rip6_usrreqs; struct sockopt; struct inpcb; int icmp6_ctloutput(struct socket *, struct sockopt *sopt); struct in6_ifaddr; void ip6_init(void); #ifdef VIMAGE void ip6_destroy(void); #endif int ip6proto_register(short); int ip6proto_unregister(short); void ip6_input(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); char * ip6_get_prevhdr(struct mbuf *, int); int ip6_nexthdr(struct mbuf *, int, int, int *); int ip6_lasthdr(struct mbuf *, int, int, int *); extern int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, struct mbuf **, int *); void ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); void ip6_notify_pmtu(struct inpcb *, struct sockaddr_in6 *, u_int32_t); int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t); void ip6_forward(struct mbuf *, int); void ip6_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in6 *); int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, struct ip6_moptions *, struct ifnet **, struct inpcb *); int ip6_ctloutput(struct socket *, struct sockopt *); int ip6_raw_ctloutput(struct socket *, struct sockopt *); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *, struct ip6_pktopts *, struct ucred *, int); void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); int ip6_deletefraghdr(struct mbuf *, int, int); -int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int); +int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int, + uint32_t); int route6_input(struct mbuf **, int *, 
int); void frag6_init(void); int frag6_input(struct mbuf **, int *, int); void frag6_slowtimo(void); void frag6_drain(void); void rip6_init(void); int rip6_input(struct mbuf **, int *, int); void rip6_ctlinput(int, struct sockaddr *, void *); int rip6_ctloutput(struct socket *, struct sockopt *); int rip6_output(struct mbuf *, struct socket *, ...); int rip6_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *); int dest6_input(struct mbuf **, int *, int); int none_input(struct mbuf **, int *, int); int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *, struct inpcb *inp, struct route_in6 *, struct ucred *cred, struct ifnet **, struct in6_addr *); int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **); int in6_selectroute_fib(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct rtentry **, u_int); u_int32_t ip6_randomid(void); u_int32_t ip6_randomflowlabel(void); void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset); #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */ Index: head/sys/netpfil/pf/pf_norm.c =================================================================== --- head/sys/netpfil/pf/pf_norm.c (revision 280954) +++ head/sys/netpfil/pf/pf_norm.c (revision 280955) @@ -1,2287 +1,2293 @@ /*- * Copyright 2001 Niels Provos * Copyright 2011 Alexander Bluhm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ struct pf_frent { TAILQ_ENTRY(pf_frent) fr_next; struct mbuf *fe_m; uint16_t fe_hdrlen; /* ipv4 header length with ip options; ipv6: extension, fragment header */ uint16_t fe_extoff; /* last extension header offset or 0 */ uint16_t fe_len; /* fragment length */ uint16_t fe_off; /* fragment offset */ uint16_t fe_mff; /* more fragment flag */ }; struct pf_fragment_cmp { struct pf_addr frc_src; struct pf_addr frc_dst; uint32_t frc_id; sa_family_t frc_af; uint8_t frc_proto; uint8_t frc_direction; }; struct pf_fragment { struct pf_fragment_cmp fr_key; #define fr_src fr_key.frc_src #define fr_dst fr_key.frc_dst #define fr_id fr_key.frc_id #define fr_af fr_key.frc_af #define fr_proto
fr_key.frc_proto #define fr_direction fr_key.frc_direction RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; uint8_t fr_flags; /* status flags */ #define PFFRAG_SEENLAST 0x0001 /* Seen the last fragment for this */ #define PFFRAG_NOBUFFER 0x0002 /* Non-buffering fragment cache */ #define PFFRAG_DROP 0x0004 /* Drop all fragments */ #define BUFFER_FRAGMENTS(fr) (!((fr)->fr_flags & PFFRAG_NOBUFFER)) uint16_t fr_max; /* fragment data max */ uint32_t fr_timeout; uint16_t fr_maxlen; /* maximum length of single fragment */ TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; }; struct pf_fragment_tag { uint16_t ft_hdrlen; /* header length of reassembled pkt */ uint16_t ft_extoff; /* last extension header offset or 0 */ uint16_t ft_maxlen; /* maximum fragment payload length */ + uint32_t ft_id; /* fragment id */ }; static struct mtx pf_frag_mtx; #define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx) #define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx) #define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED) VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ static VNET_DEFINE(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) static VNET_DEFINE(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) static VNET_DEFINE(struct pf_cachequeue, pf_cachequeue); #define V_pf_cachequeue VNET(pf_cachequeue) RB_HEAD(pf_frag_tree, pf_fragment); static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree); #define V_pf_frag_tree VNET(pf_frag_tree) static VNET_DEFINE(struct pf_frag_tree, pf_cache_tree); #define V_pf_cache_tree VNET(pf_cache_tree) static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static void 
pf_flush_fragments(void); static void pf_free_fragment(struct pf_fragment *); static void pf_remove_fragment(struct pf_fragment *); static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, struct tcphdr *, int, sa_family_t); static struct pf_frent *pf_create_fragment(u_short *); static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree); static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, struct pf_frent *, u_short *); static int pf_isfull_fragment(struct pf_fragment *); static struct mbuf *pf_join_fragment(struct pf_fragment *); #ifdef INET static void pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t); static int pf_reassemble(struct mbuf **, struct ip *, int, u_short *); static struct mbuf *pf_fragcache(struct mbuf **, struct ip*, struct pf_fragment **, int, int, int *); #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **, struct ip6_hdr *, struct ip6_frag *, uint16_t, uint16_t, int, u_short *); static void pf_scrub_ip6(struct mbuf **, uint8_t); #endif /* INET6 */ #define DPFPRINTF(x) do { \ if (V_pf_status.debug >= PF_DEBUG_MISC) { \ printf("%s: ", __func__); \ printf x ; \ } \ } while(0) #ifdef INET static void pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key) { key->frc_src.v4 = ip->ip_src; key->frc_dst.v4 = ip->ip_dst; key->frc_af = AF_INET; key->frc_proto = ip->ip_p; key->frc_id = ip->ip_id; key->frc_direction = dir; } #endif /* INET */ void pf_normalize_init(void) { V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; uma_zone_set_max(V_pf_frent_z, 
PFFRAG_FRENT_HIWAT); uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF); TAILQ_INIT(&V_pf_fragqueue); TAILQ_INIT(&V_pf_cachequeue); } void pf_normalize_cleanup(void) { uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); uma_zdestroy(V_pf_frag_z); mtx_destroy(&pf_frag_mtx); } static int pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; if ((diff = a->fr_id - b->fr_id) != 0) return (diff); if ((diff = a->fr_proto - b->fr_proto) != 0) return (diff); if ((diff = a->fr_af - b->fr_af) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) return (diff); return (0); } void pf_purge_expired_fragments(void) { struct pf_fragment *frag; u_int32_t expire = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG]; PF_FRAG_LOCK(); while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { KASSERT((BUFFER_FRAGMENTS(frag)), ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__)); if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); } while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) { KASSERT((!BUFFER_FRAGMENTS(frag)), ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__)); if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) || TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag), ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s", __FUNCTION__)); } PF_FRAG_UNLOCK(); } /* * Try to flush old fragments to make space for new ones */ static void pf_flush_fragments(void) { struct pf_fragment *frag, *cache; int goal; PF_FRAG_ASSERT(); goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; DPFPRINTF(("trying to free %d frag entriess\n", goal)); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = 
TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) pf_free_fragment(frag); cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue); if (cache) pf_free_fragment(cache); if (frag == NULL && cache == NULL) break; } } /* Frees the fragments and all associated entries */ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; PF_FRAG_ASSERT(); /* Free all fragments */ if (BUFFER_FRAGMENTS(frag)) { for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = TAILQ_FIRST(&frag->fr_queue)) { TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); m_freem(frent->fe_m); uma_zfree(V_pf_frent_z, frent); } } else { for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = TAILQ_FIRST(&frag->fr_queue)) { TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); KASSERT((TAILQ_EMPTY(&frag->fr_queue) || TAILQ_FIRST(&frag->fr_queue)->fe_off > frent->fe_len), ("! (TAILQ_EMPTY() || TAILQ_FIRST()->fe_off >" " frent->fe_len): %s", __func__)); uma_zfree(V_pf_frent_z, frent); } } pf_remove_fragment(frag); } static struct pf_fragment * pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) { struct pf_fragment *frag; PF_FRAG_ASSERT(); frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? 
*/ frag->fr_timeout = time_uptime; if (BUFFER_FRAGMENTS(frag)) { TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); } else { TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next); } } return (frag); } /* Removes a fragment from the fragment queue and frees the fragment */ static void pf_remove_fragment(struct pf_fragment *frag) { PF_FRAG_ASSERT(); if (BUFFER_FRAGMENTS(frag)) { RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } else { RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag); TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } } static struct pf_frent * pf_create_fragment(u_short *reason) { struct pf_frent *frent; PF_FRAG_ASSERT(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { pf_flush_fragments(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { REASON_SET(reason, PFRES_MEMORY); return (NULL); } } return (frent); } static struct pf_fragment * pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; uint16_t total; PF_FRAG_ASSERT(); /* No empty fragments. */ if (frent->fe_len == 0) { DPFPRINTF(("bad fragment: len 0")); goto bad_fragment; } /* All fragments are 8 byte aligned. */ if (frent->fe_mff && (frent->fe_len & 0x7)) { DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len)); goto bad_fragment; } /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { DPFPRINTF(("bad fragment: max packet %d", frent->fe_off + frent->fe_len)); goto bad_fragment; } DPFPRINTF((key->frc_af == AF_INET ? "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d", key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); /* Fully buffer all of the fragments in this fragment queue. 
*/ frag = pf_find_fragment(key, &V_pf_frag_tree); /* Create a new reassembly queue for this packet. */ if (frag == NULL) { frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { pf_flush_fragments(); frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { REASON_SET(reason, PFRES_MEMORY); goto drop_fragment; } } *(struct pf_fragment_cmp *)frag = *key; frag->fr_timeout = time_second; frag->fr_maxlen = frent->fe_len; TAILQ_INIT(&frag->fr_queue); RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); /* We do not have a previous fragment. */ TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); return (frag); } KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); /* Remember maximum fragment len for refragmentation. */ if (frent->fe_len > frag->fr_maxlen) frag->fr_maxlen = frent->fe_len; /* Maximum data we have seen already. */ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; /* Non terminal fragments must have more fragments flag. */ if (frent->fe_off + frent->fe_len < total && !frent->fe_mff) goto bad_fragment; /* Check if we saw the last fragment already. */ if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) { if (frent->fe_off + frent->fe_len > total || (frent->fe_off + frent->fe_len == total && frent->fe_mff)) goto bad_fragment; } else { if (frent->fe_off + frent->fe_len == total && !frent->fe_mff) goto bad_fragment; } /* Find a fragment after the current one. 
*/ prev = NULL; TAILQ_FOREACH(after, &frag->fr_queue, fr_next) { if (after->fe_off > frent->fe_off) break; prev = after; } KASSERT(prev != NULL || after != NULL, ("prev != NULL || after != NULL")); if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { uint16_t precut; precut = prev->fe_off + prev->fe_len - frent->fe_off; if (precut >= frent->fe_len) goto bad_fragment; DPFPRINTF(("overlap -%d", precut)); m_adj(frent->fe_m, precut); frent->fe_off += precut; frent->fe_len -= precut; } for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off; after = next) { uint16_t aftercut; aftercut = frent->fe_off + frent->fe_len - after->fe_off; DPFPRINTF(("adjust overlap %d", aftercut)); if (aftercut < after->fe_len) { m_adj(after->fe_m, aftercut); after->fe_off += aftercut; after->fe_len -= aftercut; break; } /* This fragment is completely overlapped, lose it. */ next = TAILQ_NEXT(after, fr_next); m_freem(after->fe_m); TAILQ_REMOVE(&frag->fr_queue, after, fr_next); uma_zfree(V_pf_frent_z, after); } if (prev == NULL) TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); else TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next); return (frag); bad_fragment: REASON_SET(reason, PFRES_FRAG); drop_fragment: uma_zfree(V_pf_frent_z, frent); return (NULL); } static int pf_isfull_fragment(struct pf_fragment *frag) { struct pf_frent *frent, *next; uint16_t off, total; /* Check if we are completely reassembled */ if (TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) return (0); /* Maximum data we have seen already */ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; /* Check if we have all the data */ off = 0; for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = next) { next = TAILQ_NEXT(frent, fr_next); off += frent->fe_len; if (off < total && (next == NULL || next->fe_off != off)) { DPFPRINTF(("missing fragment at %d, next %d, total %d", off, next == NULL ? 
-1 : next->fe_off, total));
			return (0);
		}
	}
	DPFPRINTF(("%d < %d?", off, total));
	if (off < total)
		return (0);
	KASSERT(off == total, ("off == total"));

	return (1);
}

/*
 * Concatenate the mbufs of all entries queued on frag into one chain.
 * The first entry's mbuf is kept whole; every later mbuf has fe_hdrlen
 * bytes stripped from its front before being appended.  Each queue
 * entry and finally the fragment descriptor itself are freed, so frag
 * must not be used after this returns.  Returns the head of the chain.
 */
static struct mbuf *
pf_join_fragment(struct pf_fragment *frag)
{
	struct mbuf *m, *m2;
	struct pf_frent	*frent, *next;

	frent = TAILQ_FIRST(&frag->fr_queue);
	next = TAILQ_NEXT(frent, fr_next);

	/* Magic from ip_input. */
	m = frent->fe_m;
	m2 = m->m_next;
	m->m_next = NULL;
	m_cat(m, m2);
	uma_zfree(V_pf_frent_z, frent);
	for (frent = next; frent != NULL; frent = next) {
		/* Fetch the successor before the entry is freed. */
		next = TAILQ_NEXT(frent, fr_next);

		m2 = frent->fe_m;
		/* Strip off ip header. */
		m_adj(m2, frent->fe_hdrlen);
		uma_zfree(V_pf_frent_z, frent);
		m_cat(m, m2);
	}

	/* Remove from fragment queue. */
	pf_remove_fragment(frag);

	return (m);
}

#ifdef INET
/*
 * Queue one IPv4 fragment for reassembly.  On PF_PASS with *m0 == NULL
 * the fragment was stored and the datagram is still incomplete; on
 * PF_PASS with *m0 != NULL, *m0 is the joined datagram.
 */
static int
pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason)
{
	struct mbuf *m = *m0;
	struct pf_frent *frent;
	struct pf_fragment *frag;
	struct pf_fragment_cmp key;
	uint16_t total, hdrlen;

	/* Get an entry for the fragment queue */
	if ((frent = pf_create_fragment(reason)) == NULL)
		return (PF_DROP);

	frent->fe_m = m;
	frent->fe_hdrlen = ip->ip_hl << 2;
	frent->fe_extoff = 0;
	frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2);
	frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
	frent->fe_mff = ntohs(ip->ip_off) & IP_MF;

	pf_ip2key(ip, dir, &key);

	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
		return (PF_DROP);

	/* The mbuf is part of the fragment entry, no direct free or access */
	m = *m0 = NULL;

	if (!pf_isfull_fragment(frag))
		return (PF_PASS);  /* drop because *m0 is NULL, no error */

	/* We have all the data */
	frent = TAILQ_FIRST(&frag->fr_queue);
	KASSERT(frent != NULL, ("frent != NULL"));
	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
	hdrlen = frent->fe_hdrlen;

	/* pf_join_fragment() frees frag; drop our pointer to it. */
	m = *m0 = pf_join_fragment(frag);
	frag = NULL;

	/* Recompute the packet header length over the joined chain. */
	if (m->m_flags & M_PKTHDR) {
		int plen = 0;
		for (m = *m0; m; m = m->m_next)
			plen += m->m_len;
		m = *m0;
m->m_pkthdr.len = plen;
	}

	/* Rewrite the header to describe the whole datagram. */
	ip = mtod(m, struct ip *);
	ip->ip_len = htons(hdrlen + total);
	ip->ip_off &= ~(IP_MF|IP_OFFMASK);

	if (hdrlen + total > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d", total));
		ip->ip_len = 0;
		REASON_SET(reason, PFRES_SHORT);
		/* PF_DROP requires a valid mbuf *m0 in pf_test() */
		return (PF_DROP);
	}

	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
	return (PF_PASS);
}
#endif	/* INET */

#ifdef INET6
/*
 * Queue one IPv6 fragment for reassembly.  hdrlen is the offset of the
 * fragment payload, extoff the offset of the last extension header
 * before the fragment header (0 if none).
 *
 * When the datagram is complete the fragments are joined, the fragment
 * header is deleted, a PF_REASSEMBLED tag carrying hdrlen, extoff,
 * maxlen and the fragment id is attached, and the payload length and
 * next-header fields are rewritten.  Takes and releases the fragment
 * lock itself.
 *
 * NOTE(review): the lines prefixed with '+' below are unified-diff
 * markers carried in this text; they are kept unchanged.
 */
static int
pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr,
    uint16_t hdrlen, uint16_t extoff, int dir, u_short *reason)
{
	struct mbuf		*m = *m0;
	struct pf_frent		*frent;
	struct pf_fragment	*frag;
	struct pf_fragment_cmp	 key;
	struct m_tag		*mtag;
	struct pf_fragment_tag	*ftag;
	int			 off;
+	uint32_t		 frag_id;
	uint16_t		 total, maxlen;
	uint8_t			 proto;

	PF_FRAG_LOCK();

	/* Get an entry for the fragment queue. */
	if ((frent = pf_create_fragment(reason)) == NULL) {
		PF_FRAG_UNLOCK();
		return (PF_DROP);
	}

	frent->fe_m = m;
	frent->fe_hdrlen = hdrlen;
	frent->fe_extoff = extoff;
	frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen;
	frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
	frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;

	key.frc_src.v6 = ip6->ip6_src;
	key.frc_dst.v6 = ip6->ip6_dst;
	key.frc_af = AF_INET6;
	/* Only the first fragment's protocol is relevant. */
	key.frc_proto = 0;
	key.frc_id = fraghdr->ip6f_ident;
	key.frc_direction = dir;

	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
		PF_FRAG_UNLOCK();
		return (PF_DROP);
	}

	/* The mbuf is part of the fragment entry, no direct free or access. */
	m = *m0 = NULL;

	if (!pf_isfull_fragment(frag)) {
		PF_FRAG_UNLOCK();
		return (PF_PASS);  /* Drop because *m0 is NULL, no error. */
	}

	/* We have all the data. */
	/* Copy what is needed out of frag before the join frees it. */
	extoff = frent->fe_extoff;
	maxlen = frag->fr_maxlen;
+	frag_id = frag->fr_id;
	frent = TAILQ_FIRST(&frag->fr_queue);
	KASSERT(frent != NULL, ("frent != NULL"));
	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
	hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag);

	m = *m0 = pf_join_fragment(frag);
	frag = NULL;

	PF_FRAG_UNLOCK();

	/* Take protocol from first fragment header. */
	m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off);
	KASSERT(m, ("%s: short mbuf chain", __func__));
	proto = *(mtod(m, caddr_t) + off);
	m = *m0;

	/* Delete frag6 header */
	if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0)
		goto fail;

	/* Recompute the packet header length over the joined chain. */
	if (m->m_flags & M_PKTHDR) {
		int plen = 0;
		for (m = *m0; m; m = m->m_next)
			plen += m->m_len;
		m = *m0;
		m->m_pkthdr.len = plen;
	}

	/* Record the original fragmentation parameters in a tag. */
	if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag),
	    M_NOWAIT)) == NULL)
		goto fail;
	ftag = (struct pf_fragment_tag *)(mtag + 1);
	ftag->ft_hdrlen = hdrlen;
	ftag->ft_extoff = extoff;
	ftag->ft_maxlen = maxlen;
+	ftag->ft_id = frag_id;
	m_tag_prepend(m, mtag);

	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total);
	if (extoff) {
		/* Write protocol into next field of last extension header. */
		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
		    &off);
		KASSERT(m, ("%s: short mbuf chain", __func__));
		*(mtod(m, char *) + off) = proto;
		m = *m0;
	} else
		ip6->ip6_nxt = proto;

	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d", total));
		ip6->ip6_plen = 0;
		REASON_SET(reason, PFRES_SHORT);
		/* PF_DROP requires a valid mbuf *m0 in pf_test6(). */
		return (PF_DROP);
	}

	DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen)));
	return (PF_PASS);

fail:
	REASON_SET(reason, PFRES_MEMORY);
	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later.
*/
	return (PF_DROP);
}
#endif	/* INET6 */

#ifdef INET
/*
 * Non-buffering fragment cache.  Tracks which byte ranges of a
 * datagram have already been passed, as a queue of entries where
 * fe_off is the start and fe_len is used as the END offset of a range
 * (see the TODO below).
 *
 * Returns the (possibly trimmed or copied) mbuf to pass on, or NULL if
 * the fragment was consumed/dropped; *nomem is set to 1 when NULL is
 * due to an allocation failure.  With drop != 0, an overlapping
 * fragment marks the whole datagram PFFRAG_DROP instead of being
 * trimmed.  *frag may be created here, and is freed and set to NULL
 * once the cached ranges cover the complete datagram.
 */
static struct mbuf *
pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
    int drop, int *nomem)
{
	struct mbuf		*m = *m0;
	struct pf_frent		*frp, *fra, *cur = NULL;
	int			 ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
	u_int16_t		 off = ntohs(h->ip_off) << 3;
	u_int16_t		 max = ip_len + off;
	int			 hosed = 0;

	PF_FRAG_ASSERT();
	KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
	    ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
			if (*frag == NULL)
				goto no_mem;
		}

		/* Get an entry for the queue */
		cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
		if (cur == NULL) {
			uma_zfree(V_pf_frag_z, *frag);
			*frag = NULL;
			goto no_mem;
		}

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_src.v4 = h->ip_src;
		(*frag)->fr_dst.v4 = h->ip_dst;
		(*frag)->fr_id = h->ip_id;
		(*frag)->fr_timeout = time_uptime;

		cur->fe_off = off;
		cur->fe_len = max; /* TODO: fe_len = max - off ? */
		TAILQ_INIT(&(*frag)->fr_queue);
		TAILQ_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);

		RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	TAILQ_FOREACH(fra, &(*frag)->fr_queue, fr_next) {
		if (fra->fe_off > off)
			break;
		frp = fra;
	}

	KASSERT((frp != NULL || fra != NULL),
	    ("!(frp != NULL || fra != NULL): %s", __FUNCTION__));

	/* Compare against the range that starts at or before off. */
	if (frp != NULL) {
		int	precut;

		precut = frp->fe_len - off;
		if (precut >= ip_len) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
			    h->ip_id, frp->fe_off, frp->fe_len, off, max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
			    h->ip_id, frp->fe_off, frp->fe_len, off, max));
			frp->fe_len = max;
		} else if (precut > 0) {
			/*
			 * The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
			    h->ip_id, precut, frp->fe_off, frp->fe_len, off,
			    max));

			off += precut;
			max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fe_len = max;

			if (!drop) {
				/*
				 * XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_dup(m, M_NOWAIT);
				if (*m0 == NULL)
					goto no_mem;
				/* From KAME Project : We have missed this! */
				m_adj(*m0, (h->ip_hl << 2) -
				    (*m0)->m_pkthdr.len);

				KASSERT(((*m0)->m_next == NULL),
				    ("(*m0)->m_next != NULL: %s",
				    __FUNCTION__));
				m_adj(m, precut + (h->ip_hl << 2));
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}

				h = mtod(m, struct ip *);

				KASSERT(((int)m->m_len ==
				    ntohs(h->ip_len) - precut),
				    ("m->m_len != ntohs(h->ip_len) - precut: %s",
				    __FUNCTION__));
				h->ip_off = htons(ntohs(h->ip_off) +
				    (precut >> 3));
				h->ip_len = htons(ntohs(h->ip_len) - precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
			    h->ip_id, -precut, frp->fe_off, frp->fe_len, off,
			    max));

			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
			if (cur == NULL)
				goto no_mem;

			cur->fe_off = off;
			cur->fe_len = max;
			TAILQ_INSERT_AFTER(&(*frag)->fr_queue, frp, cur, fr_next);
		}
	}

	/* Compare against the first range that starts after off. */
	if (fra != NULL) {
		int	aftercut;
		int	merge = 0;

		aftercut = max - fra->fe_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
			    h->ip_id, off, max, fra->fe_off, fra->fe_len));
			fra->fe_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
			    h->ip_id, aftercut, off, max, fra->fe_off,
			    fra->fe_len));
			fra->fe_off = off;
			max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next)
						plen += t->m_len;
					m->m_pkthdr.len = plen;
				}
				h = mtod(m, struct ip *);
				KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut),
				    ("m->m_len != ntohs(h->ip_len) - aftercut: %s",
				    __FUNCTION__));
				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
			    h->ip_id, -aftercut, off, max, fra->fe_off,
			    fra->fe_len));

			cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
			if (cur == NULL)
				goto no_mem;

			cur->fe_off = off;
			cur->fe_len = max;
			TAILQ_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);
		}


		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fe_off <= cur->fe_len) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, cur->fe_off, cur->fe_len, off,
				    max, fra->fe_off, fra->fe_len));
				fra->fe_off = cur->fe_off;
				TAILQ_REMOVE(&(*frag)->fr_queue, cur, fr_next);
				uma_zfree(V_pf_frent_z, cur);
				cur = NULL;

			} else if (frp && fra->fe_off <= frp->fe_len) {
				/* Need to merge in a modified 'frp' */
				KASSERT((cur == NULL), ("cur != NULL: %s",
				    __FUNCTION__));
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, frp->fe_off, frp->fe_len, off,
				    max, fra->fe_off, fra->fe_len));
				fra->fe_off = frp->fe_off;
				TAILQ_REMOVE(&(*frag)->fr_queue, frp, fr_next);
				uma_zfree(V_pf_frent_z, frp);
				frp = NULL;

			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}


 pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < max)
		(*frag)->fr_max = max;

	/* This is the last segment */
	if (!mff)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    TAILQ_FIRST(&(*frag)->fr_queue)->fe_off == 0 &&
	    TAILQ_FIRST(&(*frag)->fr_queue)->fe_len == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
		    (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return (m);

 no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	m_freem(m);
	return (NULL);

 drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL)
		(*frag)->fr_flags |= PFFRAG_SEENLAST;

	if (drop) {
		/* This fragment has been deemed bad.
Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
			    h->ip_id));
		(*frag)->fr_flags |= PFFRAG_DROP;
	}

	m_freem(m);
	return (NULL);
}
#endif	/* INET */

#ifdef INET6
/*
 * Re-fragment a packet that carries a pf_fragment_tag (mtag), using
 * the header length, extension offset, maximum fragment length and
 * fragment id saved in that tag.  The tag is deleted from the mbuf.
 *
 * On success every resulting fragment is passed to ip6_forward(), the
 * original mbuf is freed, *m0 is set to NULL and PF_PASS is returned.
 * On error the fragments are freed, *m0 is left in place and PF_DROP
 * is returned.
 *
 * NOTE(review): the lines prefixed with '+' and '-' below are
 * unified-diff markers carried in this text; they are kept unchanged.
 */
int
pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag)
{
	struct mbuf		*m = *m0, *t;
	struct pf_fragment_tag	*ftag = (struct pf_fragment_tag *)(mtag + 1);
	struct pf_pdesc		 pd;
+	uint32_t		 frag_id;
	uint16_t		 hdrlen, extoff, maxlen;
	uint8_t			 proto;
	int			 error, action;

	/* Copy the tag contents out before the tag is deleted. */
	hdrlen = ftag->ft_hdrlen;
	extoff = ftag->ft_extoff;
	maxlen = ftag->ft_maxlen;
+	frag_id = ftag->ft_id;
	m_tag_delete(m, mtag);
	mtag = NULL;
	ftag = NULL;

	if (extoff) {
		int off;

		/* Use protocol from next field of last extension header */
		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
		    &off);
		KASSERT((m != NULL), ("pf_refragment6: short mbuf chain"));
		proto = *(mtod(m, caddr_t) + off);
		*(mtod(m, char *) + off) = IPPROTO_FRAGMENT;
		m = *m0;
	} else {
		struct ip6_hdr *hdr;

		hdr = mtod(m, struct ip6_hdr *);
		proto = hdr->ip6_nxt;
		hdr->ip6_nxt = IPPROTO_FRAGMENT;
	}

	/*
	 * Maxlen may be less than 8 if there was only a single
	 * fragment.  As it was fragmented before, add a fragment
	 * header also for a single fragment.  If total or maxlen
	 * is less than 8, ip6_fragment() will return EMSGSIZE and
	 * we drop the packet.
	 */
-	error = ip6_fragment(ifp, m, hdrlen, proto, maxlen);
+	error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id);
	/* Detach the fragment list hanging off the original packet. */
	m = (*m0)->m_nextpkt;
	(*m0)->m_nextpkt = NULL;
	if (error == 0) {
		/* The first mbuf contains the unfragmented packet. */
		m_freem(*m0);
		*m0 = NULL;
		action = PF_PASS;
	} else {
		/* Drop expects an mbuf to free. */
		DPFPRINTF(("refragment error %d", error));
		action = PF_DROP;
	}
	/* Forward (or, on error, free) each fragment in turn. */
	for (t = m; m; m = t) {
		t = m->m_nextpkt;
		m->m_nextpkt = NULL;
		memset(&pd, 0, sizeof(pd));
		pd.pf_mtag = pf_find_mtag(m);
		if (error == 0)
			ip6_forward(m, 0);
		else
			m_freem(m);
	}

	return (action);
}
#endif /* INET6 */

#ifdef INET
/*
 * Apply the first matching scrub rule to an IPv4 packet: sanity-check
 * the header, optionally clear IP_DF, and run fragments through either
 * full reassembly or the non-buffering fragment cache.
 */
int
pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
    struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct pf_fragment	*frag = NULL;
	struct pf_fragment_cmp	key;
	struct ip		*h = mtod(m, struct ip *);
	int			 mff = (ntohs(h->ip_off) & IP_MF);
	int			 hlen = h->ip_hl << 2;
	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
	u_int16_t		 max;
	int			 ip_len;
	int			 ip_off;
	int			 tag = -1;
	int			 verdict;

	PF_RULES_RASSERT();

	/* Walk the scrub ruleset, using skip steps where a field mismatches. */
	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != h->ip_p)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
		    r->src.neg, kif, M_GETFIB(m)))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
		    r->dst.neg, NULL, M_GETFIB(m)))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else if (r->match_tag && !pf_match_tag(m, r, &tag,
		    pd->pf_mtag ?
pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else break; } if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } /* Check for illegal packets */ if (hlen < (int)sizeof(struct ip)) goto drop; if (hlen > ntohs(h->ip_len)) goto drop; /* Clear IP_DF if the rule uses the no-df option */ if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* We will need other tests here */ if (!fragoff && !mff) goto no_fragment; /* We're dealing with a fragment now. Don't allow fragments * with IP_DF to enter the cache. If the flag was cleared by * no-df above, fine. Otherwise drop it. */ if (h->ip_off & htons(IP_DF)) { DPFPRINTF(("IP_DF\n")); goto bad; } ip_len = ntohs(h->ip_len) - hlen; ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; /* All fragments are 8 byte aligned */ if (mff && (ip_len & 0x7)) { DPFPRINTF(("mff and %d\n", ip_len)); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { DPFPRINTF(("max packet %d\n", fragoff + ip_len)); goto bad; } max = fragoff + ip_len; if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { /* Fully buffer all of the fragments */ PF_FRAG_LOCK(); pf_ip2key(h, dir, &key); frag = pf_find_fragment(&key, &V_pf_frag_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && max > frag->fr_max) goto bad; /* Might return a completely reassembled mbuf, or NULL */ DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); verdict = pf_reassemble(m0, h, dir, reason); PF_FRAG_UNLOCK(); if (verdict != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) goto drop; h = mtod(m, struct ip *); } else { /* non-buffering fragment cache (drops or masks overlaps) */ int nomem = 0; if (dir == 
PF_OUT && pd->pf_mtag && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) { /* * Already passed the fragment cache in the * input direction. If we continued, it would * appear to be a dup and would be dropped. */ goto fragment_pass; } PF_FRAG_LOCK(); pf_ip2key(h, dir, &key); frag = pf_find_fragment(&key, &V_pf_cache_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && max > frag->fr_max) { if (r->rule_flag & PFRULE_FRAGDROP) frag->fr_flags |= PFFRAG_DROP; goto bad; } *m0 = m = pf_fragcache(m0, h, &frag, mff, (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem); PF_FRAG_UNLOCK(); if (m == NULL) { if (nomem) goto no_mem; goto drop; } if (dir == PF_IN) { /* Use mtag from copied and trimmed mbuf chain. */ pd->pf_mtag = pf_get_mtag(m); if (pd->pf_mtag == NULL) { m_freem(m); *m0 = NULL; goto no_mem; } pd->pf_mtag->flags |= PF_TAG_FRAGCACHE; } if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) goto drop; goto fragment_pass; } no_fragment: /* At this point, only IP_DF is allowed in ip_off */ if (h->ip_off & ~htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* not missing a return here */ fragment_pass: pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos); if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) pd->flags |= PFDESC_IP_REAS; return (PF_PASS); no_mem: REASON_SET(reason, PFRES_MEMORY); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); bad: DPFPRINTF(("dropping bad fragment\n")); /* Free associated fragments */ if (frag != NULL) { pf_free_fragment(frag); PF_FRAG_UNLOCK(); } REASON_SET(reason, PFRES_FRAG); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } 
#endif

#ifdef INET6
/*
 * Apply the first matching scrub rule to an IPv6 packet.  Walks the
 * extension header chain, validates a jumbo payload option if one is
 * present, checks the payload length against the mbuf length, and
 * hands packets carrying a fragment header to pf_reassemble6().
 */
int
pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
    u_short *reason, struct pf_pdesc *pd)
{
	struct mbuf		*m = *m0;
	struct pf_rule		*r;
	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
	int			 extoff;
	int			 off;
	struct ip6_ext		 ext;
	struct ip6_opt		 opt;
	struct ip6_opt_jumbo	 jumbo;
	struct ip6_frag		 frag;
	u_int32_t		 jumbolen = 0, plen;
	int			 optend;
	int			 ooff;
	u_int8_t		 proto;
	int			 terminal;

	PF_RULES_RASSERT();

	/* Walk the scrub ruleset, using skip steps where a field mismatches. */
	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != AF_INET6)
			r = r->skip[PF_SKIP_AF].ptr;
#if 0 /* header chain! */
		else if (r->proto && r->proto != h->ip6_nxt)
			r = r->skip[PF_SKIP_PROTO].ptr;
#endif
		else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip6_src, AF_INET6,
		    r->src.neg, kif, M_GETFIB(m)))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
		    r->dst.neg, NULL, M_GETFIB(m)))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else
			break;
	}

	/* No scrub rule matched, or the match says not to scrub. */
	if (r == NULL || r->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
		goto drop;

	/*
	 * Walk the header chain.  extoff tracks the offset of the most
	 * recent extension header, off the offset of the next header.
	 */
	extoff = 0;
	off = sizeof(struct ip6_hdr);
	proto = h->ip6_nxt;
	terminal = 0;
	do {
		switch (proto) {
		case IPPROTO_FRAGMENT:
			goto fragment;
			break;
		case IPPROTO_AH:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			extoff = off;
			/* AH length is in 4-byte units, the others in 8. */
			if (proto == IPPROTO_AH)
				off += (ext.ip6e_len + 2) * 4;
			else
				off += (ext.ip6e_len + 1) * 8;
			proto = ext.ip6e_nxt;
			break;
		case IPPROTO_HOPOPTS:
			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6))
				goto shortpkt;
			extoff = off;
			optend = off + (ext.ip6e_len + 1) * 8;
			ooff = off + sizeof(ext);
			/* Scan the individual hop-by-hop options. */
			do {
				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
				    sizeof(opt.ip6o_type), NULL, NULL,
				    AF_INET6))
					goto shortpkt;
				/* Pad1 is a single byte with no length field. */
				if (opt.ip6o_type == IP6OPT_PAD1) {
					ooff++;
					continue;
				}
				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
				    NULL, NULL, AF_INET6))
					goto shortpkt;
				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
					goto drop;
				switch (opt.ip6o_type) {
				case IP6OPT_JUMBO:
					if (h->ip6_plen != 0)
						goto drop;
					if (!pf_pull_hdr(m, ooff, &jumbo,
					    sizeof(jumbo), NULL, NULL,
					    AF_INET6))
						goto shortpkt;
					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
					    sizeof(jumbolen));
					jumbolen = ntohl(jumbolen);
					if (jumbolen <= IPV6_MAXPACKET)
						goto drop;
					if (sizeof(struct ip6_hdr) + jumbolen !=
					    m->m_pkthdr.len)
						goto drop;
					break;
				default:
					break;
				}
				ooff += sizeof(opt) + opt.ip6o_len;
			} while (ooff < optend);

			off = optend;
			proto = ext.ip6e_nxt;
			break;
		default:
			terminal = 1;
			break;
		}
	} while (!terminal);

	/* jumbo payload option must be present, or plen > 0 */
	if (ntohs(h->ip6_plen) == 0)
		plen = jumbolen;
	else
		plen = ntohs(h->ip6_plen);
	if (plen == 0)
		goto drop;
	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
		goto shortpkt;

	pf_scrub_ip6(&m, r->min_ttl);

	return (PF_PASS);

 fragment:
	/* Jumbo payload packets cannot be fragmented. */
	plen = ntohs(h->ip6_plen);
	if (plen == 0 || jumbolen)
		goto drop;
	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
		goto shortpkt;

	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
		goto shortpkt;

	/* Offset now points to data portion. */
	off += sizeof(frag);

	/* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */
	if (pf_reassemble6(m0, h, &frag, off, extoff, dir, reason) != PF_PASS)
		return (PF_DROP);
	m = *m0;
	if (m == NULL)
		return (PF_DROP);

	pd->flags |= PFDESC_IP_REAS;
	return (PF_PASS);

 shortpkt:
	REASON_SET(reason, PFRES_SHORT);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
		    1);
	return (PF_DROP);

 drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log)
		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
		    1);
	return (PF_DROP);
}
#endif /* INET6 */

/*
 * Apply the first matching scrub rule to a TCP header: drop packets
 * with illegal flag combinations or a too-short header length, clear
 * reserved bits and a stray urgent pointer, and process options when
 * the rule sets max_mss.
 */
int
pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
    int off, void *h, struct pf_pdesc *pd)
{
	struct pf_rule	*r, *rm = NULL;
	struct tcphdr	*th = pd->hdr.tcp;
	int		 rewrite = 0;
	u_short		 reason;
	u_int8_t	 flags;
	sa_family_t	 af = pd->af;

	PF_RULES_RASSERT();

	/* Walk the scrub ruleset; rm records the rule that matched. */
	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != af)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != pd->proto)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
		    r->src.neg, kif, M_GETFIB(m)))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (r->src.port_op && !pf_match_port(r->src.port_op,
			    r->src.port[0], r->src.port[1], th->th_sport))
			r = r->skip[PF_SKIP_SRC_PORT].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
		    r->dst.neg, NULL, M_GETFIB(m)))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
			    r->dst.port[0], r->dst.port[1], th->th_dport))
			r = r->skip[PF_SKIP_DST_PORT].ptr;
		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
			    pf_osfp_fingerprint(pd, m, off, th),
			    r->os_fingerprint))
			r = TAILQ_NEXT(r, entries);
		else {
			rm = r;
			break;
		}
	}

	if (rm == NULL || rm->action == PF_NOSCRUB)
		return (PF_PASS);
	else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}
if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) pd->flags |= PFDESC_TCP_NORM; flags = th->th_flags; if (flags & TH_SYN) { /* Illegal packet */ if (flags & TH_RST) goto tcp_drop; if (flags & TH_FIN) flags &= ~TH_FIN; } else { /* Illegal packet */ if (!(flags & (TH_ACK|TH_RST))) goto tcp_drop; } if (!(flags & TH_ACK)) { /* These flags are only valid if ACK is set */ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) goto tcp_drop; } /* Check for illegal header length */ if (th->th_off < (sizeof(struct tcphdr) >> 2)) goto tcp_drop; /* If flags changed, or reserved data set, then adjust */ if (flags != th->th_flags || th->th_x2 != 0) { u_int16_t ov, nv; ov = *(u_int16_t *)(&th->th_ack + 1); th->th_flags = flags; th->th_x2 = 0; nv = *(u_int16_t *)(&th->th_ack + 1); th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0); rewrite = 1; } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0); th->th_urp = 0; rewrite = 1; } /* Process options */ if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) rewrite = 1; /* copy back packet headers if we sanitized */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); tcp_drop: REASON_SET(&reason, PFRES_NORM); if (rm != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd, 1); return (PF_DROP); } int pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) { u_int32_t tsval, tsecr; u_int8_t hdr[60]; u_int8_t *opt; KASSERT((src->scrub == NULL), ("pf_normalize_tcp_init: src->scrub != NULL")); src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); switch (pd->af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); src->scrub->pfss_ttl = h->ip_ttl; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr 
*);
		src->scrub->pfss_ttl = h->ip6_hlim;
		break;
	}
#endif /* INET6 */
	}


	/*
	 * All normalizations below are only begun if we see the start of
	 * the connections.  They must all set an enabled bit in pfss_flags
	 */
	if ((th->th_flags & TH_SYN) == 0)
		return (0);


	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
		/* Diddle with TCP options */
		int hlen;
		opt = hdr + sizeof(struct tcphdr);
		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
		/* Stop once too few bytes remain to hold a timestamp option. */
		while (hlen >= TCPOLEN_TIMESTAMP) {
			switch (*opt) {
			case TCPOPT_EOL:	/* FALLTHROUGH */
			case TCPOPT_NOP:
				opt++;
				hlen--;
				break;
			case TCPOPT_TIMESTAMP:
				if (opt[1] >= TCPOLEN_TIMESTAMP) {
					src->scrub->pfss_flags |=
					    PFSS_TIMESTAMP;
					src->scrub->pfss_ts_mod =
					    htonl(arc4random());

					/* note PFSS_PAWS not set yet */
					memcpy(&tsval, &opt[2],
					    sizeof(u_int32_t));
					memcpy(&tsecr, &opt[6],
					    sizeof(u_int32_t));
					src->scrub->pfss_tsval0 = ntohl(tsval);
					src->scrub->pfss_tsval = ntohl(tsval);
					src->scrub->pfss_tsecr = ntohl(tsecr);
					getmicrouptime(&src->scrub->pfss_last);
				}
				/* FALLTHROUGH */
			default:
				/* Advance by at least 2 so a bogus length cannot stall. */
				hlen -= MAX(opt[1], 2);
				opt += MAX(opt[1], 2);
				break;
			}
		}
	}

	return (0);
}

/*
 * Release the per-peer scrub state of both sides of a state entry,
 * if allocated.
 */
void
pf_normalize_tcp_cleanup(struct pf_state *state)
{
	if (state->src.scrub)
		uma_zfree(V_pf_state_scrub_z, state->src.scrub);
	if (state->dst.scrub)
		uma_zfree(V_pf_state_scrub_z, state->dst.scrub);

	/* Someday... flush the TCP segment reassembly descriptors. */
}

int
pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
    u_short *reason, struct tcphdr *th, struct pf_state *state,
    struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
{
	struct timeval uptime;
	u_int32_t tsval, tsecr;
	u_int tsval_from_last;
	u_int8_t hdr[60];
	u_int8_t *opt;
	int copyback = 0;
	int got_ts = 0;

	KASSERT((src->scrub || dst->scrub),
	    ("%s: src->scrub && dst->scrub!", __func__));

	/*
	 * Enforce the minimum TTL seen for this connection.
Negate a common * technique to evade an intrusion detection system and confuse * firewall state code. */ switch (pd->af) { #ifdef INET case AF_INET: { if (src->scrub) { struct ip *h = mtod(m, struct ip *); if (h->ip_ttl > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip_ttl; h->ip_ttl = src->scrub->pfss_ttl; } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { if (src->scrub) { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (h->ip6_hlim > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip6_hlim; h->ip6_hlim = src->scrub->pfss_ttl; } break; } #endif /* INET6 */ } if (th->th_off > (sizeof(struct tcphdr) >> 2) && ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: /* Modulate the timestamps. Can be used for * NAT detection, OS uptime determination or * reboot detection. */ if (got_ts) { /* Huh? Multiple timestamps!? 
*/ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("multiple TS??")); pf_print_state(state); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } if (opt[1] >= TCPOLEN_TIMESTAMP) { memcpy(&tsval, &opt[2], sizeof(u_int32_t)); if (tsval && src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsval = ntohl(tsval); pf_change_a(&opt[2], &th->th_sum, htonl(tsval + src->scrub->pfss_ts_mod), 0); copyback = 1; } /* Modulate TS reply iff valid (!0) */ memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); if (tsecr && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsecr = ntohl(tsecr) - dst->scrub->pfss_ts_mod; pf_change_a(&opt[6], &th->th_sum, htonl(tsecr), 0); copyback = 1; } got_ts = 1; } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } if (copyback) { /* Copyback the options, caller copys back header */ *writeback = 1; m_copyback(m, off + sizeof(struct tcphdr), (th->th_off << 2) - sizeof(struct tcphdr), hdr + sizeof(struct tcphdr)); } } /* * Must invalidate PAWS checks on connections idle for too long. * The fastest allowed timestamp clock is 1ms. That turns out to * be about 24 days before it wraps. 
XXX Right now our lowerbound * TS echo check only works for the first 12 days of a connection * when the TS has exhausted half its 32bit space */ #define TS_MAX_IDLE (24*24*60*60) #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ getmicrouptime(&uptime); if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - state->creation > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("src idled out of PAWS\n")); pf_print_state(state); printf("\n"); } src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("dst idled out of PAWS\n")); pf_print_state(state); printf("\n"); } dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (got_ts && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Validate that the timestamps are "in-window". * RFC1323 describes TCP Timestamp options that allow * measurement of RTT (round trip time) and PAWS * (protection against wrapped sequence numbers). PAWS * gives us a set of rules for rejecting packets on * long fat pipes (packets that were somehow delayed * in transit longer than the time it took to send the * full TCP sequence space of 4Gb). We can use these * rules and infer a few others that will let us treat * the 32bit timestamp and the 32bit echoed timestamp * as sequence numbers to prevent a blind attacker from * inserting packets into a connection. * * RFC1323 tells us: * - The timestamp on this packet must be greater than * or equal to the last value echoed by the other * endpoint. The RFC says those will be discarded * since it is a dup that has already been acked. * This gives us a lowerbound on the timestamp. 
* timestamp >= other last echoed timestamp * - The timestamp will be less than or equal to * the last timestamp plus the time between the * last packet and now. The RFC defines the max * clock rate as 1ms. We will allow clocks to be * up to 10% fast and will allow a total difference * or 30 seconds due to a route change. And this * gives us an upperbound on the timestamp. * timestamp <= last timestamp + max ticks * We have to be careful here. Windows will send an * initial timestamp of zero and then initialize it * to a random value after the 3whs; presumably to * avoid a DoS by having to call an expensive RNG * during a SYN flood. Proof MS has at least one * good security geek. * * - The TCP timestamp option must also echo the other * endpoints timestamp. The timestamp echoed is the * one carried on the earliest unacknowledged segment * on the left edge of the sequence window. The RFC * states that the host will reject any echoed * timestamps that were larger than any ever sent. * This gives us an upperbound on the TS echo. * tescr <= largest_tsval * - The lowerbound on the TS echo is a little more * tricky to determine. The other endpoint's echoed * values will not decrease. But there may be * network conditions that re-order packets and * cause our view of them to decrease. For now the * only lowerbound we can safely determine is that * the TS echo will never be less than the original * TS. XXX There is probably a better lowerbound. * Remove TS_MAX_CONN with better lowerbound check. * tescr >= other original TS * * It is also important to note that the fastest * timestamp clock of 1ms will wrap its 32bit space in * 24 days. So we just disable TS checking after 24 * days of idle time. We actually must use a 12d * connection limit until we can come up with a better * lowerbound to the TS echo check. */ struct timeval delta_ts; int ts_fudge; /* * PFTM_TS_DIFF is how many seconds of leeway to allow * a host's timestamp. 
This can happen if the previous * packet got delayed in transit for much longer than * this packet. */ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; /* Calculate max ticks since the last timestamp */ #define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */ #define TS_MICROSECS 1000000 /* microseconds per second */ delta_ts = uptime; timevalsub(&delta_ts, &src->scrub->pfss_last); tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); if ((src->state >= TCPS_ESTABLISHED && dst->state >= TCPS_ESTABLISHED) && (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { /* Bad RFC1323 implementation or an insertion attack. * * - Solaris 2.6 and 2.7 are known to send another ACK * after the FIN,FIN|ACK,ACK closing that carries * an old timestamp. */ DPFPRINTF(("Timestamp failed %c%c%c%c\n", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? 
'3' : ' ')); DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " "idle: %jus %lums\n", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, delta_ts.tv_usec / 1000)); DPFPRINTF((" src->tsval: %u tsecr: %u\n", src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" "\n", dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } /* XXX I'd really like to require tsecr but it's optional */ } else if (!got_ts && (th->th_flags & TH_RST) == 0 && ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) || pd->p_len > 0 || (th->th_flags & TH_SYN)) && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Didn't send a timestamp. Timestamps aren't really useful * when: * - connection opening or closing (often not even sent). * but we must not let an attacker to put a FIN on a * data packet to sneak it through our ESTABLISHED check. * - on a TCP reset. RFC suggests not even looking at TS. * - on an empty ACK. The TS will not be echoed so it will * probably not help keep the RTT calculation in sync and * there isn't as much danger when the sequence numbers * got wrapped. So some stacks don't include TS on empty * ACKs :-( * * To minimize the disruption to mostly RFC1323 conformant * stacks, we will only require timestamps on data packets. * * And what do ya know, we cannot require timestamps on data * packets. There appear to be devices that do legitimate * TCP connection hijacking. There are HTTP devices that allow * a 3whs (with timestamps) and then buffer the HTTP request. * If the intermediate device has the HTTP response cache, it * will spoof the response but not bother timestamping its * packets. 
So we can look for the presence of a timestamp in * the first data packet and if there, require it in all future * packets. */ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { /* * Hey! Someone tried to sneak a packet in. Or the * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("Did not receive expected RFC1323 " "timestamp\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } } /* * We will note if a host sends his data packets with or without * timestamps. And require all data packets to contain a timestamp * if the first does. PAWS implicitly requires that all data packets be * timestamped. But I think there are middle-man devices that hijack * TCP streams immediately after the 3whs and don't timestamp their * packets (seen in a WWW accelerator or cache). */ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { if (got_ts) src->scrub->pfss_flags |= PFSS_DATA_TS; else { src->scrub->pfss_flags |= PFSS_DATA_NOTS; if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ DPFPRINTF(("Broken RFC1323 stack did not " "timestamp data packet. 
Disabled PAWS " "security.\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } } } /* * Update PAWS values */ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { getmicrouptime(&src->scrub->pfss_last); if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsval = tsval; if (tsecr) { if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsecr = tsecr; if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && (SEQ_LT(tsval, src->scrub->pfss_tsval0) || src->scrub->pfss_tsval0 == 0)) { /* tsval0 MUST be the lowest timestamp */ src->scrub->pfss_tsval0 = tsval; } /* Only fully initialized after a TS gets echoed */ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_flags |= PFSS_PAWS; } } /* I have a dream.... TCP segment reassembly.... */ return (0); } static int pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, int off, sa_family_t af) { u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; int rewrite = 0; u_char opts[TCP_MAXOLEN]; u_char *optp = opts; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, NULL, NULL, af)) return (rewrite); for (; cnt > 0; cnt -= optlen, optp += optlen) { opt = optp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = optp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: mss = (u_int16_t *)(optp + 2); if ((ntohs(*mss)) > r->max_mss) { th->th_sum = pf_cksum_fixup(th->th_sum, *mss, htons(r->max_mss), 0); *mss = htons(r->max_mss); rewrite = 1; } break; default: break; } } if (rewrite) m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); return (rewrite); } #ifdef INET static void pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos) { struct mbuf *m = 
*m0; struct ip *h = mtod(m, struct ip *); /* Clear IP_DF if no-df was requested */ if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip_ttl < min_ttl) { u_int16_t ip_ttl = h->ip_ttl; h->ip_ttl = min_ttl; h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); } /* Enforce tos */ if (flags & PFRULE_SET_TOS) { u_int16_t ov, nv; ov = *(u_int16_t *)h; h->ip_tos = tos; nv = *(u_int16_t *)h; h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); } /* random-id, but not for fragments */ if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { u_int16_t ip_id = h->ip_id; h->ip_id = ip_randomid(); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); } } #endif /* INET */ #ifdef INET6 static void pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl) { struct mbuf *m = *m0; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip6_hlim < min_ttl) h->ip6_hlim = min_ttl; } #endif