Index: head/sys/contrib/ipfilter/netinet/fil.c =================================================================== --- head/sys/contrib/ipfilter/netinet/fil.c (revision 305823) +++ head/sys/contrib/ipfilter/netinet/fil.c (revision 305824) @@ -1,10285 +1,10285 @@ /* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. * * Copyright 2008 Sun Microsystems. * * $Id$ * */ #if defined(KERNEL) || defined(_KERNEL) # undef KERNEL # undef _KERNEL # define KERNEL 1 # define _KERNEL 1 #endif #include #include #include #include #if defined(_KERNEL) && defined(__FreeBSD_version) && \ (__FreeBSD_version >= 220000) # if (__FreeBSD_version >= 400000) # if !defined(IPFILTER_LKM) # include "opt_inet6.h" # endif # if (__FreeBSD_version == 400019) # define CSUM_DELAY_DATA # endif # endif # include #else # include #endif #if (defined(__SVR4) || defined(__svr4__)) && defined(sun) # include #endif #if !defined(_AIX51) # include #endif #if defined(_KERNEL) # include # include #else # include # include # include # include # include # define _KERNEL # ifdef __OpenBSD__ struct file; # endif # include # undef _KERNEL #endif #if !defined(__SVR4) && !defined(__svr4__) && !defined(__hpux) && \ !defined(linux) # include #else # if !defined(linux) # include # endif # if (SOLARIS2 < 5) && defined(sun) # include # endif #endif #ifdef __hpux # define _NET_ROUTE_INCLUDED #endif #if !defined(linux) # include #endif #include #include #ifdef sun # include #endif #include #include #include #if defined(__sgi) && defined(IFF_DRVRLOCK) /* IRIX 6 */ # include # include #endif #include #if (!defined(__sgi) && !defined(AIX)) || defined(_KERNEL) # include # include #endif #ifdef __hpux # undef _NET_ROUTE_INCLUDED #endif #ifdef __osf__ # undef _RADIX_H_ #endif #include "netinet/ip_compat.h" #ifdef USE_INET6 # include # if !SOLARIS && defined(_KERNEL) && !defined(__osf__) && !defined(__hpux) # include # endif #endif #include "netinet/ip_fil.h" #include "netinet/ip_nat.h" #include "netinet/ip_frag.h" #include "netinet/ip_state.h" #include "netinet/ip_proxy.h" #include "netinet/ip_auth.h" #ifdef IPFILTER_SCAN # include "netinet/ip_scan.h" #endif #include "netinet/ip_sync.h" #include "netinet/ip_lookup.h" #include "netinet/ip_pool.h" #include "netinet/ip_htable.h" #ifdef IPFILTER_COMPILED # include "netinet/ip_rules.h" #endif #if defined(IPFILTER_BPF) && defined(_KERNEL) # include #endif #if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) # include #endif #include "netinet/ipl.h" #if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) # include extern struct callout ipf_slowtimer_ch; #endif #if defined(__OpenBSD__) # include extern struct timeout ipf_slowtimer_ch; #endif /* END OF INCLUDES */ #if !defined(lint) static const char sccsid[] = "@(#)fil.c 1.36 6/5/96 (C) 1993-2000 Darren Reed"; static const char rcsid[] = "@(#)$FreeBSD$"; /* static const char rcsid[] = "@(#)$Id: fil.c,v 2.243.2.125 2007/10/10 09:27:20 darrenr Exp $"; */ #endif #ifndef _KERNEL # include "ipf.h" # include "ipt.h" extern int opts; extern int blockreason; #endif /* _KERNEL */ #define LBUMP(x) softc->x++ #define LBUMPD(x, y) do { softc->x.y++; DT(y); } while (0) static INLINE int ipf_check_ipf __P((fr_info_t *, frentry_t *, int)); static u_32_t ipf_checkcipso __P((fr_info_t *, u_char *, int)); static u_32_t ipf_checkripso __P((u_char *)); static u_32_t ipf_decaps __P((fr_info_t *, u_32_t, int)); #ifdef IPFILTER_LOG static frentry_t *ipf_dolog __P((fr_info_t *, u_32_t *)); #endif static int 
ipf_flushlist __P((ipf_main_softc_t *, int *, frentry_t **)); static int ipf_flush_groups __P((ipf_main_softc_t *, frgroup_t **, int)); static ipfunc_t ipf_findfunc __P((ipfunc_t)); static void *ipf_findlookup __P((ipf_main_softc_t *, int, frentry_t *, i6addr_t *, i6addr_t *)); static frentry_t *ipf_firewall __P((fr_info_t *, u_32_t *)); static int ipf_fr_matcharray __P((fr_info_t *, int *)); static int ipf_frruleiter __P((ipf_main_softc_t *, void *, int, void *)); static void ipf_funcfini __P((ipf_main_softc_t *, frentry_t *)); static int ipf_funcinit __P((ipf_main_softc_t *, frentry_t *)); static int ipf_geniter __P((ipf_main_softc_t *, ipftoken_t *, ipfgeniter_t *)); static void ipf_getstat __P((ipf_main_softc_t *, struct friostat *, int)); static int ipf_group_flush __P((ipf_main_softc_t *, frgroup_t *)); static void ipf_group_free __P((frgroup_t *)); static int ipf_grpmapfini __P((struct ipf_main_softc_s *, frentry_t *)); static int ipf_grpmapinit __P((struct ipf_main_softc_s *, frentry_t *)); static frentry_t *ipf_nextrule __P((ipf_main_softc_t *, int, int, frentry_t *, int)); static int ipf_portcheck __P((frpcmp_t *, u_32_t)); static INLINE int ipf_pr_ah __P((fr_info_t *)); static INLINE void ipf_pr_esp __P((fr_info_t *)); static INLINE void ipf_pr_gre __P((fr_info_t *)); static INLINE void ipf_pr_udp __P((fr_info_t *)); static INLINE void ipf_pr_tcp __P((fr_info_t *)); static INLINE void ipf_pr_icmp __P((fr_info_t *)); static INLINE void ipf_pr_ipv4hdr __P((fr_info_t *)); static INLINE void ipf_pr_short __P((fr_info_t *, int)); static INLINE int ipf_pr_tcpcommon __P((fr_info_t *)); static INLINE int ipf_pr_udpcommon __P((fr_info_t *)); static void ipf_rule_delete __P((ipf_main_softc_t *, frentry_t *f, int, int)); static void ipf_rule_expire_insert __P((ipf_main_softc_t *, frentry_t *, int)); static int ipf_synclist __P((ipf_main_softc_t *, frentry_t *, void *)); static void ipf_token_flush __P((ipf_main_softc_t *)); static void ipf_token_unlink __P((ipf_main_softc_t *, ipftoken_t *)); static ipftuneable_t *ipf_tune_findbyname __P((ipftuneable_t *, const char *)); static ipftuneable_t *ipf_tune_findbycookie __P((ipftuneable_t **, void *, void **)); static int ipf_updateipid __P((fr_info_t *)); static int ipf_settimeout __P((struct ipf_main_softc_s *, struct ipftuneable *, ipftuneval_t *)); #if !defined(_KERNEL) || (!defined(__NetBSD__) && !defined(__OpenBSD__) && \ !defined(__FreeBSD__)) || \ FREEBSD_LT_REV(501000) || NETBSD_LT_REV(105000000) || \ OPENBSD_LT_REV(200006) static int ppsratecheck(struct timeval *, int *, int); #endif /* * bit values for identifying presence of individual IP options * All of these tables should be ordered by increasing key value on the left * hand side to allow for binary searching of the array and include a trailer * with a 0 for the bitmask for linear searches to easily find the end with. 
*/ static const struct optlist ipopts[20] = { { IPOPT_NOP, 0x000001 }, { IPOPT_RR, 0x000002 }, { IPOPT_ZSU, 0x000004 }, { IPOPT_MTUP, 0x000008 }, { IPOPT_MTUR, 0x000010 }, { IPOPT_ENCODE, 0x000020 }, { IPOPT_TS, 0x000040 }, { IPOPT_TR, 0x000080 }, { IPOPT_SECURITY, 0x000100 }, { IPOPT_LSRR, 0x000200 }, { IPOPT_E_SEC, 0x000400 }, { IPOPT_CIPSO, 0x000800 }, { IPOPT_SATID, 0x001000 }, { IPOPT_SSRR, 0x002000 }, { IPOPT_ADDEXT, 0x004000 }, { IPOPT_VISA, 0x008000 }, { IPOPT_IMITD, 0x010000 }, { IPOPT_EIP, 0x020000 }, { IPOPT_FINN, 0x040000 }, { 0, 0x000000 } }; #ifdef USE_INET6 static const struct optlist ip6exthdr[] = { { IPPROTO_HOPOPTS, 0x000001 }, { IPPROTO_IPV6, 0x000002 }, { IPPROTO_ROUTING, 0x000004 }, { IPPROTO_FRAGMENT, 0x000008 }, { IPPROTO_ESP, 0x000010 }, { IPPROTO_AH, 0x000020 }, { IPPROTO_NONE, 0x000040 }, { IPPROTO_DSTOPTS, 0x000080 }, { IPPROTO_MOBILITY, 0x000100 }, { 0, 0 } }; #endif /* * bit values for identifying presence of individual IP security options */ static const struct optlist secopt[8] = { { IPSO_CLASS_RES4, 0x01 }, { IPSO_CLASS_TOPS, 0x02 }, { IPSO_CLASS_SECR, 0x04 }, { IPSO_CLASS_RES3, 0x08 }, { IPSO_CLASS_CONF, 0x10 }, { IPSO_CLASS_UNCL, 0x20 }, { IPSO_CLASS_RES2, 0x40 }, { IPSO_CLASS_RES1, 0x80 } }; char ipfilter_version[] = IPL_VERSION; int ipf_features = 0 #ifdef IPFILTER_LKM | IPF_FEAT_LKM #endif #ifdef IPFILTER_LOG | IPF_FEAT_LOG #endif | IPF_FEAT_LOOKUP #ifdef IPFILTER_BPF | IPF_FEAT_BPF #endif #ifdef IPFILTER_COMPILED | IPF_FEAT_COMPILED #endif #ifdef IPFILTER_CKSUM | IPF_FEAT_CKSUM #endif | IPF_FEAT_SYNC #ifdef IPFILTER_SCAN | IPF_FEAT_SCAN #endif #ifdef USE_INET6 | IPF_FEAT_IPV6 #endif ; /* * Table of functions available for use with call rules. */ static ipfunc_resolve_t ipf_availfuncs[] = { { "srcgrpmap", ipf_srcgrpmap, ipf_grpmapinit, ipf_grpmapfini }, { "dstgrpmap", ipf_dstgrpmap, ipf_grpmapinit, ipf_grpmapfini }, { "", NULL, NULL, NULL } }; static ipftuneable_t ipf_main_tuneables[] = { { { (void *)offsetof(struct ipf_main_softc_s, ipf_flags) }, "ipf_flags", 0, 0xffffffff, stsizeof(ipf_main_softc_t, ipf_flags), 0, NULL, NULL }, { { (void *)offsetof(struct ipf_main_softc_s, ipf_active) }, "active", 0, 0, stsizeof(ipf_main_softc_t, ipf_active), IPFT_RDONLY, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_control_forwarding) }, "control_forwarding", 0, 1, stsizeof(ipf_main_softc_t, ipf_control_forwarding), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_update_ipid) }, "update_ipid", 0, 1, stsizeof(ipf_main_softc_t, ipf_update_ipid), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_chksrc) }, "chksrc", 0, 1, stsizeof(ipf_main_softc_t, ipf_chksrc), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_minttl) }, "min_ttl", 0, 1, stsizeof(ipf_main_softc_t, ipf_minttl), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_icmpminfragmtu) }, "icmp_minfragmtu", 0, 1, stsizeof(ipf_main_softc_t, ipf_icmpminfragmtu), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_pass) }, "default_pass", 0, 0xffffffff, stsizeof(ipf_main_softc_t, ipf_pass), 0, NULL, NULL }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcpidletimeout) }, "tcp_idle_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcpidletimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcpclosewait) }, "tcp_close_wait", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcpclosewait), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcplastack) }, "tcp_last_ack", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, 
ipf_tcplastack), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcptimeout) }, "tcp_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcptimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcpsynsent) }, "tcp_syn_sent", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcpsynsent), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcpsynrecv) }, "tcp_syn_received", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcpsynrecv), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcpclosed) }, "tcp_closed", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcpclosed), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcphalfclosed) }, "tcp_half_closed", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcphalfclosed), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_tcptimewait) }, "tcp_time_wait", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_tcptimewait), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_udptimeout) }, "udp_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_udptimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_udpacktimeout) }, "udp_ack_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_udpacktimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_icmptimeout) }, "icmp_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_icmptimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_icmpacktimeout) }, "icmp_ack_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_icmpacktimeout), 0, NULL, ipf_settimeout }, { { (void *)offsetof(ipf_main_softc_t, ipf_iptimeout) }, "ip_timeout", 1, 0x7fffffff, stsizeof(ipf_main_softc_t, ipf_iptimeout), 0, NULL, ipf_settimeout }, #if defined(INSTANCES) && defined(_KERNEL) { { (void *)offsetof(ipf_main_softc_t, ipf_get_loopback) }, "intercept_loopback", 0, 1, stsizeof(ipf_main_softc_t, ipf_get_loopback), 0, NULL, ipf_set_loopback }, #endif { { 0 }, NULL, 0, 0, 0, 0, NULL, NULL } }; /* * The next section of code is a a collection of small routines that set * fields in the fr_info_t structure passed based on properties of the * current packet. There are different routines for the same protocol * for each of IPv4 and IPv6. Adding a new protocol, for which there * will "special" inspection for setup, is now more easily done by adding * a new routine and expanding the ipf_pr_ipinit*() function rather than by * adding more code to a growing switch statement. 
 */
#ifdef USE_INET6
static	INLINE int	ipf_pr_ah6 __P((fr_info_t *));
static	INLINE void	ipf_pr_esp6 __P((fr_info_t *));
static	INLINE void	ipf_pr_gre6 __P((fr_info_t *));
static	INLINE void	ipf_pr_udp6 __P((fr_info_t *));
static	INLINE void	ipf_pr_tcp6 __P((fr_info_t *));
static	INLINE void	ipf_pr_icmp6 __P((fr_info_t *));
static	INLINE void	ipf_pr_ipv6hdr __P((fr_info_t *));
static	INLINE void	ipf_pr_short6 __P((fr_info_t *, int));
static	INLINE int	ipf_pr_hopopts6 __P((fr_info_t *));
static	INLINE int	ipf_pr_mobility6 __P((fr_info_t *));
static	INLINE int	ipf_pr_routing6 __P((fr_info_t *));
static	INLINE int	ipf_pr_dstopts6 __P((fr_info_t *));
static	INLINE int	ipf_pr_fragment6 __P((fr_info_t *));
static	INLINE struct ip6_ext *ipf_pr_ipv6exthdr __P((fr_info_t *, int, int));


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_short6                                                */
/* Returns:     void                                                         */
/* Parameters:  fin(I)  - pointer to packet information                     */
/*              xmin(I) - minimum header size                                */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function enforces the 'is a packet too short to be legit' rule      */
/* for IPv6 and marks the packet with FI_SHORT if so.  See the comment for  */
/* ipf_pr_short() for more details.                                         */
/* ------------------------------------------------------------------------ */
static INLINE void
ipf_pr_short6(fin, xmin)
	fr_info_t *fin;
	int xmin;
{

	if (fin->fin_dlen < xmin)
		fin->fin_flx |= FI_SHORT;
}
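/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how the short-packet rule above behaves.  The values are
 * hypothetical.  sizeof(struct icmp6_hdr) is 8, so an IPv6 packet whose
 * payload length after the IPv6 header (fin_dlen) is only 4 bytes gets
 * flagged FI_SHORT and can later be matched by "short" rules.
 */
#if 0
static void
example_short6(fr_info_t *fin)
{
	fin->fin_dlen = 4;		/* hypothetical payload length */
	ipf_pr_short6(fin, sizeof(struct icmp6_hdr));	/* xmin == 8 */
	/* fin->fin_flx now has FI_SHORT set because 4 < 8 */
}
#endif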
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_ipv6hdr                                               */
/* Returns:     void                                                         */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* Copy values from the IPv6 header into the fr_info_t struct and call the  */
/* per-protocol analyzer if it exists.  In validating the packet, a         */
/* protocol analyzer may pullup or free the packet itself so we need to be  */
/* vigilant of that possibility arising.                                    */
/* ------------------------------------------------------------------------ */
static INLINE void
ipf_pr_ipv6hdr(fin)
	fr_info_t *fin;
{
	ip6_t *ip6 = (ip6_t *)fin->fin_ip;
	int p, go = 1, i, hdrcount;
	fr_ip_t *fi = &fin->fin_fi;

	fin->fin_off = 0;

	fi->fi_tos = 0;
	fi->fi_optmsk = 0;
	fi->fi_secmsk = 0;
	fi->fi_auth = 0;

	p = ip6->ip6_nxt;
	fin->fin_crc = p;
	fi->fi_ttl = ip6->ip6_hlim;
	fi->fi_src.in6 = ip6->ip6_src;
	fin->fin_crc += fi->fi_src.i6[0];
	fin->fin_crc += fi->fi_src.i6[1];
	fin->fin_crc += fi->fi_src.i6[2];
	fin->fin_crc += fi->fi_src.i6[3];
	fi->fi_dst.in6 = ip6->ip6_dst;
	fin->fin_crc += fi->fi_dst.i6[0];
	fin->fin_crc += fi->fi_dst.i6[1];
	fin->fin_crc += fi->fi_dst.i6[2];
	fin->fin_crc += fi->fi_dst.i6[3];
	fin->fin_id = 0;
	if (IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
		fin->fin_flx |= FI_MULTICAST|FI_MBCAST;

	hdrcount = 0;
	while (go && !(fin->fin_flx & FI_SHORT)) {
		switch (p)
		{
		case IPPROTO_UDP :
			ipf_pr_udp6(fin);
			go = 0;
			break;

		case IPPROTO_TCP :
			ipf_pr_tcp6(fin);
			go = 0;
			break;

		case IPPROTO_ICMPV6 :
			ipf_pr_icmp6(fin);
			go = 0;
			break;

		case IPPROTO_GRE :
			ipf_pr_gre6(fin);
			go = 0;
			break;

		case IPPROTO_HOPOPTS :
			p = ipf_pr_hopopts6(fin);
			break;

		case IPPROTO_MOBILITY :
			p = ipf_pr_mobility6(fin);
			break;

		case IPPROTO_DSTOPTS :
			p = ipf_pr_dstopts6(fin);
			break;

		case IPPROTO_ROUTING :
			p = ipf_pr_routing6(fin);
			break;

		case IPPROTO_AH :
			p = ipf_pr_ah6(fin);
			break;

		case IPPROTO_ESP :
			ipf_pr_esp6(fin);
			go = 0;
			break;

		case IPPROTO_IPV6 :
			for (i = 0; ip6exthdr[i].ol_bit != 0; i++)
				if (ip6exthdr[i].ol_val == p) {
					fin->fin_flx |= ip6exthdr[i].ol_bit;
					break;
				}
			go = 0;
			break;

		case IPPROTO_NONE :
			go = 0;
			break;

		case IPPROTO_FRAGMENT :
			p = ipf_pr_fragment6(fin);
			/*
			 * Given that the only fragments we want to let
			 * through (where fin_off != 0) are those where the
			 * non-first fragments only have data, we can safely
			 * stop looking at headers if this is a non-leading
			 * fragment.
			 */
			if (fin->fin_off != 0)
				go = 0;
			break;

		default :
			go = 0;
			break;
		}
		hdrcount++;

		/*
		 * It is important to note that at this point, for the
		 * extension headers (go != 0), the entire header may not
		 * have been pulled up when the code gets to this point.
		 * This is only done for "go != 0" because the other header
		 * handlers will all pullup their complete header.  The
		 * other indicator of an incomplete packet is that this
		 * was just an extension header.
		 */
		if ((go != 0) && (p != IPPROTO_NONE) &&
		    (ipf_pr_pullup(fin, 0) == -1)) {
			p = IPPROTO_NONE;
			break;
		}
	}

	/*
	 * Some of the above functions, like ipf_pr_esp6(), can call
	 * ipf_pullup and destroy whatever packet was here.  The caller of
	 * this function expects us to return if there is a problem with
	 * ipf_pullup.
	 */
	if (fin->fin_m == NULL) {
		ipf_main_softc_t *softc = fin->fin_main_soft;

		LBUMPD(ipf_stats[fin->fin_out], fr_v6_bad);
		return;
	}

	fi->fi_p = p;

	/*
	 * IPv6 fragment case 1 - see comment for ipf_pr_fragment6().
	 * "go != 0" implies the above loop hasn't arrived at a layer 4
	 * header.
	 */
	if ((go != 0) && (fin->fin_flx & FI_FRAG) && (fin->fin_off == 0)) {
		ipf_main_softc_t *softc = fin->fin_main_soft;

		fin->fin_flx |= FI_BAD;
		DT2(ipf_fi_bad_ipv6_frag_1, fr_info_t *, fin, int, go);
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_badfrag);
		LBUMP(ipf_stats[fin->fin_out].fr_v6_bad);
	}
}
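/*
 * Worked example (editorial addition, not part of the original source):
 * how the "fragment case 1" test above can fire.  Consider a first
 * fragment whose header chain is truncated before any layer 4 header:
 * ipf_pr_fragment6() sets FI_FRAG with fin_off == 0, a later extension
 * header handler marks the packet FI_SHORT, and the loop exits with go
 * still non-zero because no layer 4 handler ever ran.  That combination
 * matches cases 1 and 2 of the ipf_pr_fragment6() comment below, so the
 * packet is flagged FI_BAD and fr_v6_badfrag is bumped.
 */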
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_ipv6exthdr                                            */
/* Returns:     struct ip6_ext * - pointer to the start of the next header  */
/*                                 or NULL if there is a problem.            */
/* Parameters:  fin(I)      - pointer to packet information                 */
/*              multiple(I) - flag indicating yes/no if multiple occurrences*/
/*                            of this extension header are allowed.         */
/*              proto(I)    - protocol number for this extension header     */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function embodies a number of common checks that all IPv6 extension */
/* headers must be subjected to.  For example, making sure the packet is    */
/* big enough to contain it, checking if it is repeated and setting a flag  */
/* to indicate its presence.                                                 */
/* ------------------------------------------------------------------------ */
static INLINE struct ip6_ext *
ipf_pr_ipv6exthdr(fin, multiple, proto)
	fr_info_t *fin;
	int multiple, proto;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;
	struct ip6_ext *hdr;
	u_short shift;
	int i;

	fin->fin_flx |= FI_V6EXTHDR;

				/* 8 is default length of extension hdr */
	if ((fin->fin_dlen - 8) < 0) {
		fin->fin_flx |= FI_SHORT;
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_ext_short);
		return NULL;
	}

	if (ipf_pr_pullup(fin, 8) == -1) {
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_ext_pullup);
		return NULL;
	}

	hdr = fin->fin_dp;
	switch (proto)
	{
	case IPPROTO_FRAGMENT :
		shift = 8;
		break;
	default :
		shift = 8 + (hdr->ip6e_len << 3);
		break;
	}

	if (shift > fin->fin_dlen) {	/* Nasty extension header length? */
		fin->fin_flx |= FI_BAD;
		DT3(ipf_fi_bad_pr_ipv6exthdr_len, fr_info_t *, fin,
		    u_short, shift, u_short, fin->fin_dlen);
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_ext_hlen);
		return NULL;
	}

	fin->fin_dp = (char *)fin->fin_dp + shift;
	fin->fin_dlen -= shift;

	/*
	 * If we have seen a fragment header, do not set any flags to indicate
	 * the presence of this extension header as it has no impact on the
	 * end result until after it has been defragmented.
	 */
	if (fin->fin_flx & FI_FRAG)
		return hdr;

	for (i = 0; ip6exthdr[i].ol_bit != 0; i++)
		if (ip6exthdr[i].ol_val == proto) {
			/*
			 * Most IPv6 extension headers are only allowed once.
			 */
			if ((multiple == 0) &&
			    ((fin->fin_optmsk & ip6exthdr[i].ol_bit) != 0)) {
				fin->fin_flx |= FI_BAD;
				DT2(ipf_fi_bad_ipv6exthdr_once, fr_info_t *,
				    fin, u_int,
				    (fin->fin_optmsk & ip6exthdr[i].ol_bit));
			} else
				fin->fin_optmsk |= ip6exthdr[i].ol_bit;
			break;
		}

	return hdr;
}
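/*
 * Worked example (editorial addition, not part of the original source):
 * ip6e_len counts 8-octet units beyond the first 8 octets of the
 * extension header, so the "shift" computed above is
 * 8 + (ip6e_len << 3) bytes.  A destination options header with
 * ip6e_len == 2 therefore spans 8 + 16 = 24 bytes, and fin_dp is moved
 * 24 bytes forward.  The fragment header is always exactly 8 bytes,
 * hence the IPPROTO_FRAGMENT special case.
 */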
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_hopopts6                                              */
/* Returns:     int    - value of the next header or IPPROTO_NONE if error  */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function checks the hop-by-hop options extension header             */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_hopopts6(fin)
	fr_info_t *fin;
{
	struct ip6_ext *hdr;

	hdr = ipf_pr_ipv6exthdr(fin, 0, IPPROTO_HOPOPTS);
	if (hdr == NULL)
		return IPPROTO_NONE;
	return hdr->ip6e_nxt;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_mobility6                                             */
/* Returns:     int    - value of the next header or IPPROTO_NONE if error  */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function checks the IPv6 mobility extension header                  */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_mobility6(fin)
	fr_info_t *fin;
{
	struct ip6_ext *hdr;

	hdr = ipf_pr_ipv6exthdr(fin, 0, IPPROTO_MOBILITY);
	if (hdr == NULL)
		return IPPROTO_NONE;
	return hdr->ip6e_nxt;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_routing6                                              */
/* Returns:     int    - value of the next header or IPPROTO_NONE if error  */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function checks the routing extension header                        */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_routing6(fin)
	fr_info_t *fin;
{
	struct ip6_routing *hdr;

	hdr = (struct ip6_routing *)ipf_pr_ipv6exthdr(fin, 0, IPPROTO_ROUTING);
	if (hdr == NULL)
		return IPPROTO_NONE;

	switch (hdr->ip6r_type)
	{
	case 0 :
		/*
		 * Nasty extension header length?
		 */
		if (((hdr->ip6r_len >> 1) < hdr->ip6r_segleft) ||
		    (hdr->ip6r_segleft && (hdr->ip6r_len & 1))) {
			ipf_main_softc_t *softc = fin->fin_main_soft;

			fin->fin_flx |= FI_BAD;
			DT1(ipf_fi_bad_routing6, fr_info_t *, fin);
			LBUMPD(ipf_stats[fin->fin_out], fr_v6_rh_bad);
			return IPPROTO_NONE;
		}
		break;
	default :
		break;
	}

	return hdr->ip6r_nxt;
}
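/*
 * Worked example (editorial addition, not part of the original source):
 * for a type 0 routing header, ip6r_len counts 8-octet units and each
 * IPv6 address consumes two of them, so ip6r_len >> 1 is the number of
 * addresses actually present.  With ip6r_len == 4 the header carries
 * two addresses; ip6r_segleft == 3 would then fail the
 * (ip6r_len >> 1) < ip6r_segleft test above.  Likewise, an odd
 * ip6r_len with segments left cannot describe whole addresses.  Both
 * cases are marked FI_BAD.
 */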
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_fragment6                                             */
/* Returns:     int    - value of the next header or IPPROTO_NONE if error  */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* Examine the IPv6 fragment header and extract fragment offset information.*/
/*                                                                           */
/* Fragments in IPv6 are extraordinarily difficult to deal with - much more */
/* so than in IPv4.  There are 5 cases of fragments with IPv6 that all      */
/* packets with a fragment header can fit into.  They are as follows:       */
/*                                                                           */
/* 1.  [IPv6][0-n EH][FH][0-n EH]  (no L4HDR present)                       */
/* 2.  [IPV6][0-n EH][FH][0-n EH][L4HDR part]  (short)                      */
/* 3.  [IPV6][0-n EH][FH][L4HDR part][0-n data]  (short)                    */
/* 4.  [IPV6][0-n EH][FH][0-n EH][L4HDR][0-n data]                          */
/* 5.  [IPV6][0-n EH][FH][data]                                             */
/*                                                                           */
/* IPV6 = IPv6 header, FH = Fragment Header,                                */
/* 0-n EH = 0 or more extension headers, 0-n data = 0 or more bytes of data */
/*                                                                           */
/* Packets that match 1, 2, 3 will be dropped as the only reasonable        */
/* scenario in which they happen is in extreme circumstances that are most  */
/* likely to be an indication of an attack rather than normal traffic.      */
/* A type 3 packet may be sent by an attacker after a type 4 packet.  There */
/* are two rules that can be used to guard against type 3 packets: L4       */
/* headers must always be in a packet that has the offset field set to 0    */
/* and no packet is allowed to overlay one where offset = 0.                */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_fragment6(fin)
	fr_info_t *fin;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;
	struct ip6_frag *frag;

	fin->fin_flx |= FI_FRAG;

	frag = (struct ip6_frag *)ipf_pr_ipv6exthdr(fin, 0, IPPROTO_FRAGMENT);
	if (frag == NULL) {
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_frag_bad);
		return IPPROTO_NONE;
	}

	if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0) {
		/*
		 * Any fragment that isn't the last fragment must have its
		 * length as a multiple of 8.
		 */
		if ((fin->fin_plen & 7) != 0) {
			fin->fin_flx |= FI_BAD;
			DT2(ipf_fi_bad_frag_not_8, fr_info_t *, fin, u_int,
			    (fin->fin_plen & 7));
		}
	}

	fin->fin_fraghdr = frag;
	fin->fin_id = frag->ip6f_ident;
	fin->fin_off = ntohs(frag->ip6f_offlg & IP6F_OFF_MASK);
	if (fin->fin_off != 0)
		fin->fin_flx |= FI_FRAGBODY;

	/*
	 * Jumbograms aren't handled, so the max. length is 64k
	 */
	if ((fin->fin_off << 3) + fin->fin_dlen > 65535) {
		fin->fin_flx |= FI_BAD;
		DT2(ipf_fi_bad_jumbogram, fr_info_t *, fin, u_int,
		    ((fin->fin_off << 3) + fin->fin_dlen));
	}

	/*
	 * We don't know where the transport layer header (or whatever comes
	 * next) is, as it could be behind destination options (amongst
	 * others), so return the fragment header as the type of packet this
	 * is.  Note that this effectively disables the fragment cache for
	 * > 1 protocol at a time.
	 */
	return frag->ip6f_nxt;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_dstopts6                                              */
/* Returns:     int    - value of the next header or IPPROTO_NONE if error  */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This function checks the destination options extension header            */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_dstopts6(fin)
	fr_info_t *fin;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;
	struct ip6_ext *hdr;

	hdr = ipf_pr_ipv6exthdr(fin, 0, IPPROTO_DSTOPTS);
	if (hdr == NULL) {
		LBUMPD(ipf_stats[fin->fin_out], fr_v6_dst_bad);
		return IPPROTO_NONE;
	}
	return hdr->ip6e_nxt;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_icmp6                                                 */
/* Returns:     void                                                         */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv6 Only                                                                 */
/* This routine is mainly concerned with determining the minimum valid size */
/* for an ICMPv6 packet.
*/ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_icmp6(fin) fr_info_t *fin; { int minicmpsz = sizeof(struct icmp6_hdr); struct icmp6_hdr *icmp6; if (ipf_pr_pullup(fin, ICMP6ERR_MINPKTLEN - sizeof(ip6_t)) == -1) { ipf_main_softc_t *softc = fin->fin_main_soft; LBUMPD(ipf_stats[fin->fin_out], fr_v6_icmp6_pullup); return; } if (fin->fin_dlen > 1) { ip6_t *ip6; icmp6 = fin->fin_dp; fin->fin_data[0] = *(u_short *)icmp6; if ((icmp6->icmp6_type & ICMP6_INFOMSG_MASK) != 0) fin->fin_flx |= FI_ICMPQUERY; switch (icmp6->icmp6_type) { case ICMP6_ECHO_REPLY : case ICMP6_ECHO_REQUEST : if (fin->fin_dlen >= 6) fin->fin_data[1] = icmp6->icmp6_id; minicmpsz = ICMP6ERR_MINPKTLEN - sizeof(ip6_t); break; case ICMP6_DST_UNREACH : case ICMP6_PACKET_TOO_BIG : case ICMP6_TIME_EXCEEDED : case ICMP6_PARAM_PROB : fin->fin_flx |= FI_ICMPERR; minicmpsz = ICMP6ERR_IPICMPHLEN - sizeof(ip6_t); if (fin->fin_plen < ICMP6ERR_IPICMPHLEN) break; if (M_LEN(fin->fin_m) < fin->fin_plen) { if (ipf_coalesce(fin) != 1) return; } if (ipf_pr_pullup(fin, ICMP6ERR_MINPKTLEN) == -1) return; /* * If the destination of this packet doesn't match the * source of the original packet then this packet is * not correct. */ icmp6 = fin->fin_dp; ip6 = (ip6_t *)((char *)icmp6 + ICMPERR_ICMPHLEN); if (IP6_NEQ(&fin->fin_fi.fi_dst, (i6addr_t *)&ip6->ip6_src)) { fin->fin_flx |= FI_BAD; DT1(ipf_fi_bad_icmp6, fr_info_t *, fin); } break; default : break; } } ipf_pr_short6(fin, minicmpsz); if ((fin->fin_flx & (FI_SHORT|FI_BAD)) == 0) { u_char p = fin->fin_p; fin->fin_p = IPPROTO_ICMPV6; ipf_checkv6sum(fin); fin->fin_p = p; } } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_udp6 */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv6 Only */ /* Analyse the packet for IPv6/UDP properties. */ /* Is not expected to be called for fragmented packets. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_udp6(fin) fr_info_t *fin; { if (ipf_pr_udpcommon(fin) == 0) { u_char p = fin->fin_p; fin->fin_p = IPPROTO_UDP; ipf_checkv6sum(fin); fin->fin_p = p; } } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_tcp6 */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv6 Only */ /* Analyse the packet for IPv6/TCP properties. */ /* Is not expected to be called for fragmented packets. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_tcp6(fin) fr_info_t *fin; { if (ipf_pr_tcpcommon(fin) == 0) { u_char p = fin->fin_p; fin->fin_p = IPPROTO_TCP; ipf_checkv6sum(fin); fin->fin_p = p; } } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_esp6 */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv6 Only */ /* Analyse the packet for ESP properties. */ /* The minimum length is taken to be the SPI (32bits) plus a tail (32bits) */ /* even though the newer ESP packets must also have a sequence number that */ /* is 32bits as well, it is not possible(?) to determine the version from a */ /* simple packet header. 
*/ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_esp6(fin) fr_info_t *fin; { if ((fin->fin_off == 0) && (ipf_pr_pullup(fin, 8) == -1)) { ipf_main_softc_t *softc = fin->fin_main_soft; LBUMPD(ipf_stats[fin->fin_out], fr_v6_esp_pullup); return; } } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_ah6 */ /* Returns: int - value of the next header or IPPROTO_NONE if error */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv6 Only */ /* Analyse the packet for AH properties. */ /* The minimum length is taken to be the combination of all fields in the */ /* header being present and no authentication data (null algorithm used.) */ /* ------------------------------------------------------------------------ */ static INLINE int ipf_pr_ah6(fin) fr_info_t *fin; { authhdr_t *ah; fin->fin_flx |= FI_AH; ah = (authhdr_t *)ipf_pr_ipv6exthdr(fin, 0, IPPROTO_HOPOPTS); if (ah == NULL) { ipf_main_softc_t *softc = fin->fin_main_soft; LBUMPD(ipf_stats[fin->fin_out], fr_v6_ah_bad); return IPPROTO_NONE; } ipf_pr_short6(fin, sizeof(*ah)); /* * No need for another pullup, ipf_pr_ipv6exthdr() will pullup * enough data to satisfy ah_next (the very first one.) */ return ah->ah_next; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_gre6 */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Analyse the packet for GRE properties. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_gre6(fin) fr_info_t *fin; { grehdr_t *gre; if (ipf_pr_pullup(fin, sizeof(grehdr_t)) == -1) { ipf_main_softc_t *softc = fin->fin_main_soft; LBUMPD(ipf_stats[fin->fin_out], fr_v6_gre_pullup); return; } gre = fin->fin_dp; if (GRE_REV(gre->gr_flags) == 1) fin->fin_data[0] = gre->gr_call; } #endif /* USE_INET6 */ /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_pullup */ /* Returns: int - 0 == pullup succeeded, -1 == failure */ /* Parameters: fin(I) - pointer to packet information */ /* plen(I) - length (excluding L3 header) to pullup */ /* */ /* Short inline function to cut down on code duplication to perform a call */ /* to ipf_pullup to ensure there is the required amount of data, */ /* consecutively in the packet buffer. */ /* */ /* This function pulls up 'extra' data at the location of fin_dp. fin_dp */ /* points to the first byte after the complete layer 3 header, which will */ /* include all of the known extension headers for IPv6 or options for IPv4. */ /* */ /* Since fr_pullup() expects the total length of bytes to be pulled up, it */ /* is necessary to add those we can already assume to be pulled up (fin_dp */ /* - fin_ip) to what is passed through. 
 */
/* ------------------------------------------------------------------------ */
int
ipf_pr_pullup(fin, plen)
	fr_info_t *fin;
	int plen;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;

	if (fin->fin_m != NULL) {
		if (fin->fin_dp != NULL)
			plen += (char *)fin->fin_dp -
				((char *)fin->fin_ip + fin->fin_hlen);
		plen += fin->fin_hlen;
		if (M_LEN(fin->fin_m) < plen + fin->fin_ipoff) {
#if defined(_KERNEL)
			if (ipf_pullup(fin->fin_m, fin, plen) == NULL) {
				DT(ipf_pullup_fail);
				LBUMP(ipf_stats[fin->fin_out].fr_pull[1]);
				return -1;
			}
			LBUMP(ipf_stats[fin->fin_out].fr_pull[0]);
#else
			LBUMP(ipf_stats[fin->fin_out].fr_pull[1]);
			/*
			 * Fake ipf_pullup failing
			 */
			fin->fin_reason = FRB_PULLUP;
			*fin->fin_mp = NULL;
			fin->fin_m = NULL;
			fin->fin_ip = NULL;
			return -1;
#endif
		}
	}
	return 0;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_short                                                 */
/* Returns:     void                                                         */
/* Parameters:  fin(I)  - pointer to packet information                     */
/*              xmin(I) - minimum header size                                */
/*                                                                           */
/* Check if a packet is "short" as defined by xmin.  The rule we are        */
/* applying here is that the packet must not be fragmented within the layer */
/* 4 header.  That is, it must not be a fragment that has its offset set to */
/* start within the layer 4 header (hdrmin) or if it is at offset 0, the    */
/* entire layer 4 header must be present (min).                             */
/* ------------------------------------------------------------------------ */
static INLINE void
ipf_pr_short(fin, xmin)
	fr_info_t *fin;
	int xmin;
{

	if (fin->fin_off == 0) {
		if (fin->fin_dlen < xmin)
			fin->fin_flx |= FI_SHORT;
	} else if (fin->fin_off < xmin) {
		fin->fin_flx |= FI_SHORT;
	}
}
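/*
 * Worked example (editorial addition, not part of the original source):
 * the arithmetic in ipf_pr_pullup() above.  Suppose a hypothetical IPv4
 * packet with a 20 byte header (fin_hlen == 20) whose fin_dp has
 * already been advanced 8 bytes past the header (as ipf_pr_ah() does
 * when it skips the authentication header).  A call of
 * ipf_pr_pullup(fin, 8) then asks ipf_pullup() for 8 + 8 + 20 == 36
 * contiguous bytes: the new bytes wanted, the bytes already consumed
 * in front of fin_dp, and the layer 3 header itself.
 */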
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_icmp                                                  */
/* Returns:     void                                                         */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* IPv4 Only                                                                 */
/* Do a sanity check on the packet for ICMP (v4).  In nearly all cases,     */
/* except extremely bad packets, both type and code will be present.        */
/* The expected minimum size of an ICMP packet is very much dependent on    */
/* the type of it.                                                          */
/*                                                                           */
/* XXX - other ICMP sanity checks?                                          */
/* ------------------------------------------------------------------------ */
static INLINE void
ipf_pr_icmp(fin)
	fr_info_t *fin;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;
	int minicmpsz = sizeof(struct icmp);
	icmphdr_t *icmp;
	ip_t *oip;

	ipf_pr_short(fin, ICMPERR_ICMPHLEN);

	if (fin->fin_off != 0) {
		LBUMPD(ipf_stats[fin->fin_out], fr_v4_icmp_frag);
		return;
	}

	if (ipf_pr_pullup(fin, ICMPERR_ICMPHLEN) == -1) {
		LBUMPD(ipf_stats[fin->fin_out], fr_v4_icmp_pullup);
		return;
	}

	icmp = fin->fin_dp;

	fin->fin_data[0] = *(u_short *)icmp;
	fin->fin_data[1] = icmp->icmp_id;

	switch (icmp->icmp_type)
	{
	case ICMP_ECHOREPLY :
	case ICMP_ECHO :
	/* Router discovery messages - RFC 1256 */
	case ICMP_ROUTERADVERT :
	case ICMP_ROUTERSOLICIT :
		fin->fin_flx |= FI_ICMPQUERY;
		minicmpsz = ICMP_MINLEN;
		break;
	/*
	 * type(1) + code(1) + cksum(2) + id(2) seq(2) +
	 * 3 * timestamp(3 * 4)
	 */
	case ICMP_TSTAMP :
	case ICMP_TSTAMPREPLY :
		fin->fin_flx |= FI_ICMPQUERY;
		minicmpsz = 20;
		break;
	/*
	 * type(1) + code(1) + cksum(2) + id(2) seq(2) +
	 * mask(4)
	 */
	case ICMP_IREQ :
	case ICMP_IREQREPLY :
	case ICMP_MASKREQ :
	case ICMP_MASKREPLY :
		fin->fin_flx |= FI_ICMPQUERY;
		minicmpsz = 12;
		break;
	/*
	 * type(1) + code(1) + cksum(2) + id(2) seq(2) + ip(20+)
	 */
	case ICMP_UNREACH :
#ifdef icmp_nextmtu
		if (icmp->icmp_code == ICMP_UNREACH_NEEDFRAG) {
			if (icmp->icmp_nextmtu < softc->ipf_icmpminfragmtu) {
				fin->fin_flx |= FI_BAD;
				DT3(ipf_fi_bad_icmp_nextmtu, fr_info_t *, fin,
				    u_int, icmp->icmp_nextmtu, u_int,
				    softc->ipf_icmpminfragmtu);
			}
		}
#endif
	case ICMP_SOURCEQUENCH :
	case ICMP_REDIRECT :
	case ICMP_TIMXCEED :
	case ICMP_PARAMPROB :
		fin->fin_flx |= FI_ICMPERR;
		if (ipf_coalesce(fin) != 1) {
			LBUMPD(ipf_stats[fin->fin_out], fr_icmp_coalesce);
			return;
		}

		/*
		 * ICMP error packets should not be generated for IP
		 * packets that are a fragment that isn't the first
		 * fragment.
		 */
		oip = (ip_t *)((char *)fin->fin_dp + ICMPERR_ICMPHLEN);
		if ((ntohs(oip->ip_off) & IP_OFFMASK) != 0) {
			fin->fin_flx |= FI_BAD;
			DT2(ipf_fi_bad_icmp_err, fr_info_t, fin, u_int,
			    (ntohs(oip->ip_off) & IP_OFFMASK));
		}

		/*
		 * If the destination of this packet doesn't match the
		 * source of the original packet then this packet is
		 * not correct.
		 */
		if (oip->ip_src.s_addr != fin->fin_daddr) {
			fin->fin_flx |= FI_BAD;
			DT1(ipf_fi_bad_src_ne_dst, fr_info_t *, fin);
		}
		break;
	default :
		break;
	}

	ipf_pr_short(fin, minicmpsz);

	ipf_checkv4sum(fin);
}
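/*
 * Worked example (editorial addition, not part of the original source):
 * the ICMP error checks above use the IP header quoted after the 8 byte
 * ICMP header.  For a hypothetical ICMP_UNREACH delivered to 192.0.2.1
 * (fin_daddr), the quoted oip->ip_src must also be 192.0.2.1, since
 * errors go back to the sender of the offending packet; any other value
 * earns FI_BAD.  A quoted header with a non-zero fragment offset is
 * also bad, because errors are never generated for non-first fragments.
 */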
/* ------------------------------------------------------------------------ */
/* Function:    ipf_pr_tcpcommon                                             */
/* Returns:     int    - 0 = header ok, 1 = bad packet, -1 = buffer error   */
/* Parameters:  fin(I) - pointer to packet information                      */
/*                                                                           */
/* TCP header sanity checking.  Look for bad combinations of TCP flags,     */
/* and make some checks with how they interact with other fields.           */
/* If compiled with IPFILTER_CKSUM, check to see if the TCP checksum is     */
/* valid and mark the packet as bad if not.                                 */
/* ------------------------------------------------------------------------ */
static INLINE int
ipf_pr_tcpcommon(fin)
	fr_info_t *fin;
{
	ipf_main_softc_t *softc = fin->fin_main_soft;
	int flags, tlen;
	tcphdr_t *tcp;

	fin->fin_flx |= FI_TCPUDP;
	if (fin->fin_off != 0) {
		LBUMPD(ipf_stats[fin->fin_out], fr_tcp_frag);
		return 0;
	}

	if (ipf_pr_pullup(fin, sizeof(*tcp)) == -1) {
		LBUMPD(ipf_stats[fin->fin_out], fr_tcp_pullup);
		return -1;
	}

	tcp = fin->fin_dp;
	if (fin->fin_dlen > 3) {
		fin->fin_sport = ntohs(tcp->th_sport);
		fin->fin_dport = ntohs(tcp->th_dport);
	}

	if ((fin->fin_flx & FI_SHORT) != 0) {
		LBUMPD(ipf_stats[fin->fin_out], fr_tcp_short);
		return 1;
	}

	/*
	 * Use of the TCP data offset *must* result in a value that is at
	 * least the same size as the TCP header.
	 */
	tlen = TCP_OFF(tcp) << 2;
	if (tlen < sizeof(tcphdr_t)) {
		LBUMPD(ipf_stats[fin->fin_out], fr_tcp_small);
		fin->fin_flx |= FI_BAD;
		DT3(ipf_fi_bad_tlen, fr_info_t, fin, u_int, tlen, u_int,
		    sizeof(tcphdr_t));
		return 1;
	}

	flags = tcp->th_flags;
	fin->fin_tcpf = tcp->th_flags;

	/*
	 * If the urgent flag is set, then the urgent pointer must
	 * also be set and vice versa.  Good TCP packets do not have
	 * just one of these set.
	 */
	if ((flags & TH_URG) != 0 && (tcp->th_urp == 0)) {
		fin->fin_flx |= FI_BAD;
		DT3(ipf_fi_bad_th_urg, fr_info_t*, fin, u_int,
		    (flags & TH_URG), u_int, tcp->th_urp);
#if 0
	} else if ((flags & TH_URG) == 0 && (tcp->th_urp != 0)) {
		/*
		 * Ignore this case (#if 0) as it shows up in "real"
		 * traffic with bogus values in the urgent pointer field.
		 */
		fin->fin_flx |= FI_BAD;
		DT3(ipf_fi_bad_th_urg0, fr_info_t *, fin, u_int,
		    (flags & TH_URG), u_int, tcp->th_urp);
#endif
	} else if (((flags & (TH_SYN|TH_FIN)) != 0) &&
		   ((flags & (TH_RST|TH_ACK)) == TH_RST)) {
		/* TH_FIN|TH_RST|TH_ACK seems to appear "naturally" */
		fin->fin_flx |= FI_BAD;
		DT1(ipf_fi_bad_th_fin_rst_ack, fr_info_t, fin);
#if 1
	} else if (((flags & TH_SYN) != 0) &&
		   ((flags & (TH_URG|TH_PUSH)) != 0)) {
		/*
		 * SYN with URG and PUSH set is not for normal TCP but it is
		 * possible(?) with T/TCP...but who uses T/TCP?
		 */
		fin->fin_flx |= FI_BAD;
		DT1(ipf_fi_bad_th_syn_urg_psh, fr_info_t *, fin);
#endif
	} else if (!(flags & TH_ACK)) {
		/*
		 * If the ack bit isn't set, then either the SYN or
		 * RST bit must be set.  If the SYN bit is set, then
		 * we expect the ACK field to be 0.  If the ACK is
		 * not set and if URG, PSH or FIN are set, consider
		 * that to indicate a bad TCP packet.
		 */
		if ((flags == TH_SYN) && (tcp->th_ack != 0)) {
			/*
			 * Cisco PIX sets the ACK field to a random value.
			 * In light of this, do not set FI_BAD until a patch
			 * is available from Cisco to ensure that
			 * interoperability between existing systems is
			 * achieved.
			 */
			/*fin->fin_flx |= FI_BAD*/;
			/*DT1(ipf_fi_bad_th_syn_ack, fr_info_t *, fin);*/
		} else if (!(flags & (TH_RST|TH_SYN))) {
			fin->fin_flx |= FI_BAD;
			DT1(ipf_fi_bad_th_rst_syn, fr_info_t *, fin);
		} else if ((flags & (TH_URG|TH_PUSH|TH_FIN)) != 0) {
			fin->fin_flx |= FI_BAD;
			DT1(ipf_fi_bad_th_urg_push_fin, fr_info_t *, fin);
		}
	}
	if (fin->fin_flx & FI_BAD) {
		LBUMPD(ipf_stats[fin->fin_out], fr_tcp_bad_flags);
		return 1;
	}

	/*
	 * At this point, it's not exactly clear what is to be gained by
	 * marking up which TCP options are and are not present.  The one we
	 * are most interested in is the TCP window scale.  This is only in
	 * a SYN packet [RFC1323] so we don't need this here...?
	 * Now if we were to analyse the header for passive fingerprinting,
	 * then that might add some weight to adding this...
*/ if (tlen == sizeof(tcphdr_t)) { return 0; } if (ipf_pr_pullup(fin, tlen) == -1) { LBUMPD(ipf_stats[fin->fin_out], fr_tcp_pullup); return -1; } #if 0 tcp = fin->fin_dp; ip = fin->fin_ip; s = (u_char *)(tcp + 1); off = IP_HL(ip) << 2; # ifdef _KERNEL if (fin->fin_mp != NULL) { mb_t *m = *fin->fin_mp; if (off + tlen > M_LEN(m)) return; } # endif for (tlen -= (int)sizeof(*tcp); tlen > 0; ) { opt = *s; if (opt == '\0') break; else if (opt == TCPOPT_NOP) ol = 1; else { if (tlen < 2) break; ol = (int)*(s + 1); if (ol < 2 || ol > tlen) break; } for (i = 9, mv = 4; mv >= 0; ) { op = ipopts + i; if (opt == (u_char)op->ol_val) { optmsk |= op->ol_bit; break; } } tlen -= ol; s += ol; } #endif /* 0 */ return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_udpcommon */ /* Returns: int - 0 = header ok, 1 = bad packet */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Extract the UDP source and destination ports, if present. If compiled */ /* with IPFILTER_CKSUM, check to see if the UDP checksum is valid. */ /* ------------------------------------------------------------------------ */ static INLINE int ipf_pr_udpcommon(fin) fr_info_t *fin; { udphdr_t *udp; fin->fin_flx |= FI_TCPUDP; if (!fin->fin_off && (fin->fin_dlen > 3)) { if (ipf_pr_pullup(fin, sizeof(*udp)) == -1) { ipf_main_softc_t *softc = fin->fin_main_soft; fin->fin_flx |= FI_SHORT; LBUMPD(ipf_stats[fin->fin_out], fr_udp_pullup); return 1; } udp = fin->fin_dp; fin->fin_sport = ntohs(udp->uh_sport); fin->fin_dport = ntohs(udp->uh_dport); } return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_tcp */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv4 Only */ /* Analyse the packet for IPv4/TCP properties. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_tcp(fin) fr_info_t *fin; { ipf_pr_short(fin, sizeof(tcphdr_t)); if (ipf_pr_tcpcommon(fin) == 0) ipf_checkv4sum(fin); } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_udp */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv4 Only */ /* Analyse the packet for IPv4/UDP properties. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_udp(fin) fr_info_t *fin; { ipf_pr_short(fin, sizeof(udphdr_t)); if (ipf_pr_udpcommon(fin) == 0) ipf_checkv4sum(fin); } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_esp */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Analyse the packet for ESP properties. */ /* The minimum length is taken to be the SPI (32bits) plus a tail (32bits) */ /* even though the newer ESP packets must also have a sequence number that */ /* is 32bits as well, it is not possible(?) to determine the version from a */ /* simple packet header. 
*/ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_esp(fin) fr_info_t *fin; { if (fin->fin_off == 0) { ipf_pr_short(fin, 8); if (ipf_pr_pullup(fin, 8) == -1) { ipf_main_softc_t *softc = fin->fin_main_soft; LBUMPD(ipf_stats[fin->fin_out], fr_v4_esp_pullup); } } } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_ah */ /* Returns: int - value of the next header or IPPROTO_NONE if error */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Analyse the packet for AH properties. */ /* The minimum length is taken to be the combination of all fields in the */ /* header being present and no authentication data (null algorithm used.) */ /* ------------------------------------------------------------------------ */ static INLINE int ipf_pr_ah(fin) fr_info_t *fin; { ipf_main_softc_t *softc = fin->fin_main_soft; authhdr_t *ah; int len; fin->fin_flx |= FI_AH; ipf_pr_short(fin, sizeof(*ah)); if (((fin->fin_flx & FI_SHORT) != 0) || (fin->fin_off != 0)) { LBUMPD(ipf_stats[fin->fin_out], fr_v4_ah_bad); return IPPROTO_NONE; } if (ipf_pr_pullup(fin, sizeof(*ah)) == -1) { DT(fr_v4_ah_pullup_1); LBUMP(ipf_stats[fin->fin_out].fr_v4_ah_pullup); return IPPROTO_NONE; } ah = (authhdr_t *)fin->fin_dp; len = (ah->ah_plen + 2) << 2; ipf_pr_short(fin, len); if (ipf_pr_pullup(fin, len) == -1) { DT(fr_v4_ah_pullup_2); LBUMP(ipf_stats[fin->fin_out].fr_v4_ah_pullup); return IPPROTO_NONE; } /* * Adjust fin_dp and fin_dlen for skipping over the authentication * header. */ fin->fin_dp = (char *)fin->fin_dp + len; fin->fin_dlen -= len; return ah->ah_next; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_gre */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Analyse the packet for GRE properties. */ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_gre(fin) fr_info_t *fin; { ipf_main_softc_t *softc = fin->fin_main_soft; grehdr_t *gre; ipf_pr_short(fin, sizeof(grehdr_t)); if (fin->fin_off != 0) { LBUMPD(ipf_stats[fin->fin_out], fr_v4_gre_frag); return; } if (ipf_pr_pullup(fin, sizeof(grehdr_t)) == -1) { LBUMPD(ipf_stats[fin->fin_out], fr_v4_gre_pullup); return; } gre = fin->fin_dp; if (GRE_REV(gre->gr_flags) == 1) fin->fin_data[0] = gre->gr_call; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pr_ipv4hdr */ /* Returns: void */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* IPv4 Only */ /* Analyze the IPv4 header and set fields in the fr_info_t structure. */ /* Check all options present and flag their presence if any exist. 
*/ /* ------------------------------------------------------------------------ */ static INLINE void ipf_pr_ipv4hdr(fin) fr_info_t *fin; { u_short optmsk = 0, secmsk = 0, auth = 0; int hlen, ol, mv, p, i; const struct optlist *op; u_char *s, opt; u_short off; fr_ip_t *fi; ip_t *ip; fi = &fin->fin_fi; hlen = fin->fin_hlen; ip = fin->fin_ip; p = ip->ip_p; fi->fi_p = p; fin->fin_crc = p; fi->fi_tos = ip->ip_tos; fin->fin_id = ip->ip_id; off = ntohs(ip->ip_off); /* Get both TTL and protocol */ fi->fi_p = ip->ip_p; fi->fi_ttl = ip->ip_ttl; /* Zero out bits not used in IPv6 address */ fi->fi_src.i6[1] = 0; fi->fi_src.i6[2] = 0; fi->fi_src.i6[3] = 0; fi->fi_dst.i6[1] = 0; fi->fi_dst.i6[2] = 0; fi->fi_dst.i6[3] = 0; fi->fi_saddr = ip->ip_src.s_addr; fin->fin_crc += fi->fi_saddr; fi->fi_daddr = ip->ip_dst.s_addr; fin->fin_crc += fi->fi_daddr; if (IN_CLASSD(ntohl(fi->fi_daddr))) fin->fin_flx |= FI_MULTICAST|FI_MBCAST; /* * set packet attribute flags based on the offset and * calculate the byte offset that it represents. */ off &= IP_MF|IP_OFFMASK; if (off != 0) { int morefrag = off & IP_MF; fi->fi_flx |= FI_FRAG; off &= IP_OFFMASK; if (off != 0) { fin->fin_flx |= FI_FRAGBODY; off <<= 3; if ((off + fin->fin_dlen > 65535) || (fin->fin_dlen == 0) || ((morefrag != 0) && ((fin->fin_dlen & 7) != 0))) { /* * The length of the packet, starting at its * offset cannot exceed 65535 (0xffff) as the * length of an IP packet is only 16 bits. * * Any fragment that isn't the last fragment * must have a length greater than 0 and it * must be an even multiple of 8. */ fi->fi_flx |= FI_BAD; DT1(ipf_fi_bad_fragbody_gt_65535, fr_info_t *, fin); } } } fin->fin_off = off; /* * Call per-protocol setup and checking */ if (p == IPPROTO_AH) { /* * Treat AH differently because we expect there to be another * layer 4 header after it. */ p = ipf_pr_ah(fin); } switch (p) { case IPPROTO_UDP : ipf_pr_udp(fin); break; case IPPROTO_TCP : ipf_pr_tcp(fin); break; case IPPROTO_ICMP : ipf_pr_icmp(fin); break; case IPPROTO_ESP : ipf_pr_esp(fin); break; case IPPROTO_GRE : ipf_pr_gre(fin); break; } ip = fin->fin_ip; if (ip == NULL) return; /* * If it is a standard IP header (no options), set the flag fields * which relate to options to 0. */ if (hlen == sizeof(*ip)) { fi->fi_optmsk = 0; fi->fi_secmsk = 0; fi->fi_auth = 0; return; } /* * So the IP header has some IP options attached. Walk the entire * list of options present with this packet and set flags to indicate * which ones are here and which ones are not. For the somewhat out * of date and obscure security classification options, set a flag to * represent which classification is present. 
*/ fi->fi_flx |= FI_OPTIONS; for (s = (u_char *)(ip + 1), hlen -= (int)sizeof(*ip); hlen > 0; ) { opt = *s; if (opt == '\0') break; else if (opt == IPOPT_NOP) ol = 1; else { if (hlen < 2) break; ol = (int)*(s + 1); if (ol < 2 || ol > hlen) break; } for (i = 9, mv = 4; mv >= 0; ) { op = ipopts + i; if ((opt == (u_char)op->ol_val) && (ol > 4)) { u_32_t doi; switch (opt) { case IPOPT_SECURITY : if (optmsk & op->ol_bit) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_ipopt_security, fr_info_t *, fin, u_short, (optmsk & op->ol_bit)); } else { doi = ipf_checkripso(s); secmsk = doi >> 16; auth = doi & 0xffff; } break; case IPOPT_CIPSO : if (optmsk & op->ol_bit) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_ipopt_cipso, fr_info_t *, fin, u_short, (optmsk & op->ol_bit)); } else { doi = ipf_checkcipso(fin, s, ol); secmsk = doi >> 16; auth = doi & 0xffff; } break; } optmsk |= op->ol_bit; } if (opt < op->ol_val) i -= mv; else i += mv; mv--; } hlen -= ol; s += ol; } /* * */ if (auth && !(auth & 0x0100)) auth &= 0xff00; fi->fi_optmsk = optmsk; fi->fi_secmsk = secmsk; fi->fi_auth = auth; } /* ------------------------------------------------------------------------ */ /* Function: ipf_checkripso */ /* Returns: void */ /* Parameters: s(I) - pointer to start of RIPSO option */ /* */ /* ------------------------------------------------------------------------ */ static u_32_t ipf_checkripso(s) u_char *s; { const struct optlist *sp; u_short secmsk = 0, auth = 0; u_char sec; int j, m; sec = *(s + 2); /* classification */ for (j = 3, m = 2; m >= 0; ) { sp = secopt + j; if (sec == sp->ol_val) { secmsk |= sp->ol_bit; auth = *(s + 3); auth *= 256; auth += *(s + 4); break; } if (sec < sp->ol_val) j -= m; else j += m; m--; } return (secmsk << 16) | auth; } /* ------------------------------------------------------------------------ */ /* Function: ipf_checkcipso */ /* Returns: u_32_t - 0 = failure, else the doi from the header */ /* Parameters: fin(IO) - pointer to packet information */ /* s(I) - pointer to start of CIPSO option */ /* ol(I) - length of CIPSO option field */ /* */ /* This function returns the domain of integrity (DOI) field from the CIPSO */ /* header and returns that whilst also storing the highest sensitivity */ /* value found in the fr_info_t structure. */ /* */ /* No attempt is made to extract the category bitmaps as these are defined */ /* by the user (rather than the protocol) and can be rather numerous on the */ /* end nodes. */ /* ------------------------------------------------------------------------ */ static u_32_t ipf_checkcipso(fin, s, ol) fr_info_t *fin; u_char *s; int ol; { ipf_main_softc_t *softc = fin->fin_main_soft; fr_ip_t *fi; u_32_t doi; u_char *t, tag, tlen, sensitivity; int len; if (ol < 6 || ol > 40) { LBUMPD(ipf_stats[fin->fin_out], fr_v4_cipso_bad); fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_ol, fr_info_t *, fin, u_int, ol); return 0; } fi = &fin->fin_fi; fi->fi_sensitivity = 0; /* * The DOI field MUST be there. */ bcopy(s + 2, &doi, sizeof(doi)); t = (u_char *)s + 6; for (len = ol - 6; len >= 2; len -= tlen, t+= tlen) { tag = *t; tlen = *(t + 1); if (tlen > len || tlen < 4 || tlen > 34) { LBUMPD(ipf_stats[fin->fin_out], fr_v4_cipso_tlen); fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tlen, fr_info_t *, fin, u_int, tlen); return 0; } sensitivity = 0; /* * Tag numbers 0, 1, 2, 5 are laid out in the CIPSO Internet * draft (16 July 1992) that has expired. 
*/ if (tag == 0) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tag, fr_info_t *, fin, u_int, tag); continue; } else if (tag == 1) { if (*(t + 2) != 0) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tag1_t2, fr_info_t *, fin, u_int, (*t + 2)); continue; } sensitivity = *(t + 3); /* Category bitmap for categories 0-239 */ } else if (tag == 4) { if (*(t + 2) != 0) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tag4_t2, fr_info_t *, fin, u_int, (*t + 2)); continue; } sensitivity = *(t + 3); /* Enumerated categories, 16bits each, upto 15 */ } else if (tag == 5) { if (*(t + 2) != 0) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tag5_t2, fr_info_t *, fin, u_int, (*t + 2)); continue; } sensitivity = *(t + 3); /* Range of categories (2*16bits), up to 7 pairs */ } else if (tag > 127) { /* Custom defined DOI */ ; } else { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkcipso_tag127, fr_info_t *, fin, u_int, tag); continue; } if (sensitivity > fi->fi_sensitivity) fi->fi_sensitivity = sensitivity; } return doi; } /* ------------------------------------------------------------------------ */ /* Function: ipf_makefrip */ /* Returns: int - 0 == packet ok, -1 == packet freed */ /* Parameters: hlen(I) - length of IP packet header */ /* ip(I) - pointer to the IP header */ /* fin(IO) - pointer to packet information */ /* */ /* Compact the IP header into a structure which contains just the info. */ /* which is useful for comparing IP headers with and store this information */ /* in the fr_info_t structure pointer to by fin. At present, it is assumed */ /* this function will be called with either an IPv4 or IPv6 packet. */ /* ------------------------------------------------------------------------ */ int ipf_makefrip(hlen, ip, fin) int hlen; ip_t *ip; fr_info_t *fin; { ipf_main_softc_t *softc = fin->fin_main_soft; int v; fin->fin_depth = 0; fin->fin_hlen = (u_short)hlen; fin->fin_ip = ip; fin->fin_rule = 0xffffffff; fin->fin_group[0] = -1; fin->fin_group[1] = '\0'; fin->fin_dp = (char *)ip + hlen; v = fin->fin_v; if (v == 4) { fin->fin_plen = ntohs(ip->ip_len); fin->fin_dlen = fin->fin_plen - hlen; ipf_pr_ipv4hdr(fin); #ifdef USE_INET6 } else if (v == 6) { fin->fin_plen = ntohs(((ip6_t *)ip)->ip6_plen); fin->fin_dlen = fin->fin_plen; fin->fin_plen += hlen; ipf_pr_ipv6hdr(fin); #endif } if (fin->fin_ip == NULL) { LBUMP(ipf_stats[fin->fin_out].fr_ip_freed); return -1; } return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_portcheck */ /* Returns: int - 1 == port matched, 0 == port match failed */ /* Parameters: frp(I) - pointer to port check `expression' */ /* pop(I) - port number to evaluate */ /* */ /* Perform a comparison of a port number against some other(s), using a */ /* structure with compare information stored in it. */ /* ------------------------------------------------------------------------ */ static INLINE int ipf_portcheck(frp, pop) frpcmp_t *frp; u_32_t pop; { int err = 1; u_32_t po; po = frp->frp_port; /* * Do opposite test to that required and continue if that succeeds. 
	 */
	switch (frp->frp_cmp)
	{
	case FR_EQUAL :
		if (pop != po) /* EQUAL */
			err = 0;
		break;
	case FR_NEQUAL :
		if (pop == po) /* NOTEQUAL */
			err = 0;
		break;
	case FR_LESST :
		if (pop >= po) /* LESSTHAN */
			err = 0;
		break;
	case FR_GREATERT :
		if (pop <= po) /* GREATERTHAN */
			err = 0;
		break;
	case FR_LESSTE :
		if (pop > po) /* LT or EQ */
			err = 0;
		break;
	case FR_GREATERTE :
		if (pop < po) /* GT or EQ */
			err = 0;
		break;
	case FR_OUTRANGE :
		if (pop >= po && pop <= frp->frp_top) /* Out of range */
			err = 0;
		break;
	case FR_INRANGE :
		if (pop <= po || pop >= frp->frp_top) /* In range */
			err = 0;
		break;
	case FR_INCRANGE :
		if (pop < po || pop > frp->frp_top) /* Inclusive range */
			err = 0;
		break;
	default :
		break;
	}
	return err;
}


/* ------------------------------------------------------------------------ */
/* Function:    ipf_tcpudpchk                                                */
/* Returns:     int - 1 == protocol matched, 0 == check failed              */
/* Parameters:  fda(I) - pointer to packet information                      */
/*              ft(I)  - pointer to structure with comparison data          */
/*                                                                           */
/* Compares the current packet (assuming it is TCP/UDP) information with a  */
/* structure containing information that we want to match against.          */
/* ------------------------------------------------------------------------ */
int
ipf_tcpudpchk(fi, ft)
	fr_ip_t *fi;
	frtuc_t *ft;
{
	int err = 1;

	/*
	 * Both ports should *always* be in the first fragment.
	 * So far, I cannot find any cases where they can not be.
	 *
	 * compare destination ports
	 */
	if (ft->ftu_dcmp)
		err = ipf_portcheck(&ft->ftu_dst, fi->fi_ports[1]);

	/*
	 * compare source ports
	 */
	if (err && ft->ftu_scmp)
		err = ipf_portcheck(&ft->ftu_src, fi->fi_ports[0]);

	/*
	 * If we don't have all the TCP/UDP header, then how can we
	 * expect to do any sort of match on it ?  If we were looking for
	 * TCP flags, then NO match.  If not, then match (which should
	 * satisfy the "short" class too).
	 */
	if (err && (fi->fi_p == IPPROTO_TCP)) {
		if (fi->fi_flx & FI_SHORT)
			return !(ft->ftu_tcpf | ft->ftu_tcpfm);
		/*
		 * Match the flags ?  If not, abort this match.
		 */
		if (ft->ftu_tcpfm &&
		    ft->ftu_tcpf != (fi->fi_tcpf & ft->ftu_tcpfm)) {
			FR_DEBUG(("f. %#x & %#x != %#x\n", fi->fi_tcpf,
				 ft->ftu_tcpfm, ft->ftu_tcpf));
			err = 0;
		}
	}
	return err;
}
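/*
 * Worked example (editorial addition, not part of the original source):
 * ipf_portcheck() above tests the opposite of each comparison and
 * clears err on success, so err == 1 means "matched".  With
 * hypothetical bounds frp_port == 1024 and frp_top == 2048:
 * FR_OUTRANGE matches port 80 (outside [1024,2048]) but not 1500,
 * FR_INRANGE matches 1500 but neither endpoint, and FR_INCRANGE
 * matches 1024, 1500 and 2048 alike.
 */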
%#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); if (i != 0) return 1; lip++, lm++, ld++; /* * Unrolled loops (4 each, for 32 bits) for address checks. */ /* * Check the source address. */ if (fr->fr_satype == FRI_LOOKUP) { i = (*fr->fr_srcfunc)(fin->fin_main_soft, fr->fr_srcptr, fi->fi_v, lip, fin->fin_plen); if (i == -1) return 1; lip += 3; lm += 3; ld += 3; } else { i = ((*lip & *lm) != *ld); FR_DEBUG(("2a. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); if (fi->fi_v == 6) { lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("2b. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("2c. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("2d. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); } else { lip += 3; lm += 3; ld += 3; } } i ^= (fr->fr_flags & FR_NOTSRCIP) >> 6; if (i != 0) return 1; /* * Check the destination address. */ lip++, lm++, ld++; if (fr->fr_datype == FRI_LOOKUP) { i = (*fr->fr_dstfunc)(fin->fin_main_soft, fr->fr_dstptr, fi->fi_v, lip, fin->fin_plen); if (i == -1) return 1; lip += 3; lm += 3; ld += 3; } else { i = ((*lip & *lm) != *ld); FR_DEBUG(("3a. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); if (fi->fi_v == 6) { lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("3b. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("3c. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); lip++, lm++, ld++; i |= ((*lip & *lm) != *ld); FR_DEBUG(("3d. %#08x & %#08x != %#08x\n", ntohl(*lip), ntohl(*lm), ntohl(*ld))); } else { lip += 3; lm += 3; ld += 3; } } i ^= (fr->fr_flags & FR_NOTDSTIP) >> 7; if (i != 0) return 1; /* * IP addresses matched. The next 32 bits contain: * mask of old IP header security & authentication bits. */ lip++, lm++, ld++; i = (*ld - (*lip & *lm)); FR_DEBUG(("4. %#08x & %#08x != %#08x\n", *lip, *lm, *ld)); /* * Next we have 32 bits of packet flags. */ lip++, lm++, ld++; i |= (*ld - (*lip & *lm)); FR_DEBUG(("5. %#08x & %#08x != %#08x\n", *lip, *lm, *ld)); if (i == 0) { /* * If a fragment, then only the first has what we're * looking for here... */ if (portcmp) { if (!ipf_tcpudpchk(&fin->fin_fi, &fr->fr_tuc)) i = 1; } else { if (fr->fr_dcmp || fr->fr_scmp || fr->fr_tcpf || fr->fr_tcpfm) i = 1; if (fr->fr_icmpm || fr->fr_icmp) { if (((fi->fi_p != IPPROTO_ICMP) && (fi->fi_p != IPPROTO_ICMPV6)) || fin->fin_off || (fin->fin_dlen < 2)) i = 1; else if ((fin->fin_data[0] & fr->fr_icmpm) != fr->fr_icmp) { FR_DEBUG(("i. %#x & %#x != %#x\n", fin->fin_data[0], fr->fr_icmpm, fr->fr_icmp)); i = 1; } } } } return i; } /* ------------------------------------------------------------------------ */ /* Function: ipf_scanlist */ /* Returns: int - result flags of scanning filter list */ /* Parameters: fin(I) - pointer to packet information */ /* pass(I) - default result to return for filtering */ /* */ /* Check the input/output list of rules for a match to the current packet. */ /* If a match is found, the value of fr_flags from the rule becomes the */ /* return value and fin->fin_fr points to the matched rule. */ /* */ /* This function may be called recursively up to 16 times (built-in limit.) */ /* When unwinding, it should finish up with fin_depth as 0. 
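As an illustration, with a hypothetical ruleset of the form "block in all head 100", "pass in on em0 all group 100 head 200", "pass in proto tcp all group 200", each matching head rule makes ipf_scanlist descend one level into its group, so fin_depth counts the group nesting and the limit of 16 bounds it.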
*/ /* */ /* Could be per interface, but this gets real nasty when you don't have, */ /* or can't easily change, the kernel source code. */ /* ------------------------------------------------------------------------ */ int ipf_scanlist(fin, pass) fr_info_t *fin; u_32_t pass; { ipf_main_softc_t *softc = fin->fin_main_soft; int rulen, portcmp, off, skip; struct frentry *fr, *fnext; u_32_t passt, passo; /* * Do not allow nesting deeper than 16 levels. */ if (fin->fin_depth >= 16) return pass; fr = fin->fin_fr; /* * If there are no rules in this list, return now. */ if (fr == NULL) return pass; skip = 0; portcmp = 0; fin->fin_depth++; fin->fin_fr = NULL; off = fin->fin_off; if ((fin->fin_flx & FI_TCPUDP) && (fin->fin_dlen > 3) && !off) portcmp = 1; for (rulen = 0; fr; fr = fnext, rulen++) { fnext = fr->fr_next; if (skip != 0) { FR_VERBOSE(("SKIP %d (%#x)\n", skip, fr->fr_flags)); skip--; continue; } /* * In all checks below, a null (zero) value in the * filter structure is taken to mean a wildcard. * * check that we are working for the right interface */ #ifdef _KERNEL if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) continue; #else if (opts & (OPT_VERBOSE|OPT_DEBUG)) printf("\n"); FR_VERBOSE(("%c", FR_ISSKIP(pass) ? 's' : FR_ISPASS(pass) ? 'p' : FR_ISACCOUNT(pass) ? 'A' : FR_ISAUTH(pass) ? 'a' : (pass & FR_NOMATCH) ? 'n' :'b')); if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) continue; FR_VERBOSE((":i")); #endif switch (fr->fr_type) { case FR_T_IPF : case FR_T_IPF_BUILTIN : if (ipf_check_ipf(fin, fr, portcmp)) continue; break; #if defined(IPFILTER_BPF) case FR_T_BPFOPC : case FR_T_BPFOPC_BUILTIN : { u_char *mc; int wlen; if (*fin->fin_mp == NULL) continue; if (fin->fin_family != fr->fr_family) continue; mc = (u_char *)fin->fin_m; wlen = fin->fin_dlen + fin->fin_hlen; if (!bpf_filter(fr->fr_data, mc, wlen, 0)) continue; break; } #endif case FR_T_CALLFUNC_BUILTIN : { frentry_t *f; f = (*fr->fr_func)(fin, &pass); if (f != NULL) fr = f; else continue; break; } case FR_T_IPFEXPR : case FR_T_IPFEXPR_BUILTIN : if (fin->fin_family != fr->fr_family) continue; if (ipf_fr_matcharray(fin, fr->fr_data) == 0) continue; break; default : break; } if ((fin->fin_out == 0) && (fr->fr_nattag.ipt_num[0] != 0)) { if (fin->fin_nattag == NULL) continue; if (ipf_matchtag(&fr->fr_nattag, fin->fin_nattag) == 0) continue; } FR_VERBOSE(("=%d/%d.%d *", fr->fr_grhead, fr->fr_group, rulen)); passt = fr->fr_flags; /* * If the rule is a "call now" rule, then call the function * in the rule, if it exists and use the results from that. * If the function pointer is bad, just make like we ignore * it, except for increasing the hit counter. */ if ((passt & FR_CALLNOW) != 0) { frentry_t *frs; ATOMIC_INC64(fr->fr_hits); if ((fr->fr_func == NULL) || (fr->fr_func == (ipfunc_t)-1)) continue; frs = fin->fin_fr; fin->fin_fr = fr; fr = (*fr->fr_func)(fin, &passt); if (fr == NULL) { fin->fin_fr = frs; continue; } passt = fr->fr_flags; } fin->fin_fr = fr; #ifdef IPFILTER_LOG /* * Just log this packet... 
*/ if ((passt & FR_LOGMASK) == FR_LOG) { if (ipf_log_pkt(fin, passt) == -1) { if (passt & FR_LOGORBLOCK) { DT(frb_logfail); passt &= ~FR_CMDMASK; passt |= FR_BLOCK|FR_QUICK; fin->fin_reason = FRB_LOGFAIL; } } } #endif /* IPFILTER_LOG */ MUTEX_ENTER(&fr->fr_lock); fr->fr_bytes += (U_QUAD_T)fin->fin_plen; fr->fr_hits++; MUTEX_EXIT(&fr->fr_lock); fin->fin_rule = rulen; passo = pass; if (FR_ISSKIP(passt)) { skip = fr->fr_arg; continue; } else if (((passt & FR_LOGMASK) != FR_LOG) && ((passt & FR_LOGMASK) != FR_DECAPSULATE)) { pass = passt; } if (passt & (FR_RETICMP|FR_FAKEICMP)) fin->fin_icode = fr->fr_icode; if (fr->fr_group != -1) { (void) strncpy(fin->fin_group, FR_NAME(fr, fr_group), strlen(FR_NAME(fr, fr_group))); } else { fin->fin_group[0] = '\0'; } FR_DEBUG(("pass %#x/%#x/%x\n", passo, pass, passt)); if (fr->fr_grphead != NULL) { fin->fin_fr = fr->fr_grphead->fg_start; FR_VERBOSE(("group %s\n", FR_NAME(fr, fr_grhead))); if (FR_ISDECAPS(passt)) passt = ipf_decaps(fin, pass, fr->fr_icode); else passt = ipf_scanlist(fin, pass); if (fin->fin_fr == NULL) { fin->fin_rule = rulen; if (fr->fr_group != -1) (void) strncpy(fin->fin_group, fr->fr_names + fr->fr_group, strlen(fr->fr_names + fr->fr_group)); fin->fin_fr = fr; passt = pass; } pass = passt; } if (pass & FR_QUICK) { /* * Finally, if we've asked to track state for this * packet, set it up. Add state for "quick" rules * here so that if the action fails we can consider * the rule to "not match" and keep on processing * filter rules. */ if ((pass & FR_KEEPSTATE) && !FR_ISAUTH(pass) && !(fin->fin_flx & FI_STATE)) { int out = fin->fin_out; fin->fin_fr = fr; if (ipf_state_add(softc, fin, NULL, 0) == 0) { LBUMPD(ipf_stats[out], fr_ads); } else { LBUMPD(ipf_stats[out], fr_bads); pass = passo; continue; } } break; } } fin->fin_depth--; return pass; } /* ------------------------------------------------------------------------ */ /* Function: ipf_acctpkt */ /* Returns: frentry_t* - always returns NULL */ /* Parameters: fin(I) - pointer to packet information */ /* passp(IO) - pointer to current/new filter decision (unused) */ /* */ /* Checks a packet against accounting rules, if there are any for the given */ /* IP protocol version. */ /* */ /* N.B.: this function returns NULL to match the prototype used by other */ /* functions called from the IPFilter "mainline" in ipf_check(). */ /* ------------------------------------------------------------------------ */ frentry_t * ipf_acctpkt(fin, passp) fr_info_t *fin; u_32_t *passp; { ipf_main_softc_t *softc = fin->fin_main_soft; char group[FR_GROUPLEN]; frentry_t *fr, *frsave; u_32_t pass, rulen; passp = passp; fr = softc->ipf_acct[fin->fin_out][softc->ipf_active]; if (fr != NULL) { frsave = fin->fin_fr; bcopy(fin->fin_group, group, FR_GROUPLEN); rulen = fin->fin_rule; fin->fin_fr = fr; pass = ipf_scanlist(fin, FR_NOMATCH); if (FR_ISACCOUNT(pass)) { LBUMPD(ipf_stats[0], fr_acct); } fin->fin_fr = frsave; bcopy(group, fin->fin_group, FR_GROUPLEN); fin->fin_rule = rulen; } return NULL; } /* ------------------------------------------------------------------------ */ /* Function: ipf_firewall */ /* Returns: frentry_t* - returns pointer to matched rule, if no matches */ /* were found, returns NULL. */ /* Parameters: fin(I) - pointer to packet information */ /* passp(IO) - pointer to current/new filter decision (unused) */ /* */ /* Applies an appropriate set of firewall rules to the packet, to see if */ /* there are any matches. The first check is to see if a match can be seen */ /* in the cache. 
If not, then search an appropriate list of rules. Once a */ /* matching rule is found, take any appropriate actions as defined by the */ /* rule - except logging. */ /* ------------------------------------------------------------------------ */ static frentry_t * ipf_firewall(fin, passp) fr_info_t *fin; u_32_t *passp; { ipf_main_softc_t *softc = fin->fin_main_soft; frentry_t *fr; u_32_t pass; int out; out = fin->fin_out; pass = *passp; /* * This rule cache will only affect packets that are not being * statefully filtered. */ fin->fin_fr = softc->ipf_rules[out][softc->ipf_active]; if (fin->fin_fr != NULL) pass = ipf_scanlist(fin, softc->ipf_pass); if ((pass & FR_NOMATCH)) { LBUMPD(ipf_stats[out], fr_nom); } fr = fin->fin_fr; /* * Apply packets per second rate-limiting to a rule as required. */ if ((fr != NULL) && (fr->fr_pps != 0) && !ppsratecheck(&fr->fr_lastpkt, &fr->fr_curpps, fr->fr_pps)) { DT2(frb_ppsrate, fr_info_t *, fin, frentry_t *, fr); pass &= ~(FR_CMDMASK|FR_RETICMP|FR_RETRST); pass |= FR_BLOCK; LBUMPD(ipf_stats[out], fr_ppshit); fin->fin_reason = FRB_PPSRATE; } /* * If we fail to add a packet to the authorization queue, then we * drop the packet later. However, if it was added then pretend * we've dropped it already. */ if (FR_ISAUTH(pass)) { if (ipf_auth_new(fin->fin_m, fin) != 0) { DT1(frb_authnew, fr_info_t *, fin); fin->fin_m = *fin->fin_mp = NULL; fin->fin_reason = FRB_AUTHNEW; fin->fin_error = 0; } else { IPFERROR(1); fin->fin_error = ENOSPC; } } if ((fr != NULL) && (fr->fr_func != NULL) && (fr->fr_func != (ipfunc_t)-1) && !(pass & FR_CALLNOW)) (void) (*fr->fr_func)(fin, &pass); /* * If a rule is a pre-auth rule, check again in the list of rules * loaded for authenticated use. It does not particularly matter * if this search fails because a "preauth" result, from a rule, * is treated as "not a pass", hence the packet is blocked. */ if (FR_ISPREAUTH(pass)) { pass = ipf_auth_pre_scanlist(softc, fin, pass); } /* * If the rule has "keep frag" and the packet is actually a fragment, * then create a fragment state entry. */ if ((pass & (FR_KEEPFRAG|FR_KEEPSTATE)) == FR_KEEPFRAG) { if (fin->fin_flx & FI_FRAG) { if (ipf_frag_new(softc, fin, pass) == -1) { LBUMP(ipf_stats[out].fr_bnfr); } else { LBUMP(ipf_stats[out].fr_nfr); } } else { LBUMP(ipf_stats[out].fr_cfr); } } fr = fin->fin_fr; *passp = pass; return fr; } /* ------------------------------------------------------------------------ */ /* Function: ipf_check */ /* Returns: int - 0 == packet allowed through, */ /* User space: */ /* -1 == packet blocked */ /* 1 == packet not matched */ /* -2 == requires authentication */ /* Kernel: */ /* > 0 == filter error # for packet */ /* Parameters: ip(I) - pointer to start of IPv4/6 packet */ /* hlen(I) - length of header */ /* ifp(I) - pointer to interface this packet is on */ /* out(I) - 0 == packet going in, 1 == packet going out */ /* mp(IO) - pointer to caller's buffer pointer that holds this */ /* IP packet. */ /* Solaris & HP-UX ONLY : */ /* qpi(I) - pointer to STREAMS queue information for this */ /* interface & direction. */ /* */ /* ipf_check() is the master function for all IPFilter packet processing. */ /* It orchestrates: Network Address Translation (NAT), checking for packet */ /* authorisation (or pre-authorisation), presence of related state info., */ /* generating log entries, IP packet accounting, routing of packets as */ /* directed by firewall rules and of course whether or not to allow the */ /* packet to be further processed by the kernel. 
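In rough order, an inbound packet below goes through NAT (ipf_nat_checkin), the authentication table (ipf_auth_check), fragment and state table lookups (ipf_frag_known, ipf_state_check) and only then rule scanning via ipf_firewall; outbound packets are accounted and NAT-translated only after a pass result, which the code below mirrors.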
*/ /* */ /* For packets blocked, the contents of "mp" will be NULL'd and the buffer */ /* freed. Packets passed may be returned with the pointer pointed to */ /* by "mp" changed to a new buffer. */ /* ------------------------------------------------------------------------ */ int ipf_check(ctx, ip, hlen, ifp, out #if defined(_KERNEL) && defined(MENTAT) , qif, mp) void *qif; #else , mp) #endif mb_t **mp; ip_t *ip; int hlen; void *ifp; int out; void *ctx; { /* * The above really sucks, but short of writing a diff */ ipf_main_softc_t *softc = ctx; fr_info_t frinfo; fr_info_t *fin = &frinfo; u_32_t pass = softc->ipf_pass; frentry_t *fr = NULL; int v = IP_V(ip); mb_t *mc = NULL; mb_t *m; /* * The first part of ipf_check() deals with making sure that what goes * into the filtering engine makes some sense. Information about * the packet is distilled, collected into a fr_info_t structure and * then an attempt is made to ensure the buffer the packet is in is big * enough to hold all the required packet headers. */ #ifdef _KERNEL # ifdef MENTAT qpktinfo_t *qpi = qif; # ifdef __sparc if ((u_int)ip & 0x3) return 2; # endif # else SPL_INT(s); # endif if (softc->ipf_running <= 0) { return 0; } bzero((char *)fin, sizeof(*fin)); # ifdef MENTAT if (qpi->qpi_flags & QF_BROADCAST) fin->fin_flx |= FI_MBCAST|FI_BROADCAST; if (qpi->qpi_flags & QF_MULTICAST) fin->fin_flx |= FI_MBCAST|FI_MULTICAST; m = qpi->qpi_m; fin->fin_qfm = m; fin->fin_qpi = qpi; # else /* MENTAT */ m = *mp; # if defined(M_MCAST) if ((m->m_flags & M_MCAST) != 0) fin->fin_flx |= FI_MBCAST|FI_MULTICAST; # endif # if defined(M_MLOOP) if ((m->m_flags & M_MLOOP) != 0) fin->fin_flx |= FI_MBCAST|FI_MULTICAST; # endif # if defined(M_BCAST) if ((m->m_flags & M_BCAST) != 0) fin->fin_flx |= FI_MBCAST|FI_BROADCAST; # endif # ifdef M_CANFASTFWD /* * XXX For now, IP Filter and fast-forwarding of cached flows * XXX are mutually exclusive. Eventually, IP Filter should * XXX get a "can-fast-forward" filter rule. */ m->m_flags &= ~M_CANFASTFWD; # endif /* M_CANFASTFWD */ # if defined(CSUM_DELAY_DATA) && (!defined(__FreeBSD_version) || \ (__FreeBSD_version < 501108)) /* * disable delayed checksums. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } # endif /* CSUM_DELAY_DATA */ # endif /* MENTAT */ #else bzero((char *)fin, sizeof(*fin)); m = *mp; # if defined(M_MCAST) if ((m->m_flags & M_MCAST) != 0) fin->fin_flx |= FI_MBCAST|FI_MULTICAST; # endif # if defined(M_MLOOP) if ((m->m_flags & M_MLOOP) != 0) fin->fin_flx |= FI_MBCAST|FI_MULTICAST; # endif # if defined(M_BCAST) if ((m->m_flags & M_BCAST) != 0) fin->fin_flx |= FI_MBCAST|FI_BROADCAST; # endif #endif /* _KERNEL */ fin->fin_v = v; fin->fin_m = m; fin->fin_ip = ip; fin->fin_mp = mp; fin->fin_out = out; fin->fin_ifp = ifp; fin->fin_error = ENETUNREACH; fin->fin_hlen = (u_short)hlen; fin->fin_dp = (char *)ip + hlen; fin->fin_main_soft = softc; fin->fin_ipoff = (char *)ip - MTOD(m, char *); SPL_NET(s); #ifdef USE_INET6 if (v == 6) { LBUMP(ipf_stats[out].fr_ipv6); /* * Jumbo grams are quite likely too big for internal buffer * structures to handle comfortably, for now, so just drop * them. 
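* A jumbogram announces itself with ip6_plen == 0 plus a hop-by-hop * Jumbo Payload option (RFC 2675), which is why the test below keys on * a zero payload length.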
*/ if (((ip6_t *)ip)->ip6_plen == 0) { DT1(frb_jumbo, ip6_t *, (ip6_t *)ip); pass = FR_BLOCK|FR_NOMATCH; fin->fin_reason = FRB_JUMBO; goto finished; } fin->fin_family = AF_INET6; } else #endif { fin->fin_family = AF_INET; } if (ipf_makefrip(hlen, ip, fin) == -1) { DT1(frb_makefrip, fr_info_t *, fin); pass = FR_BLOCK|FR_NOMATCH; fin->fin_reason = FRB_MAKEFRIP; goto finished; } /* * For at least IPv6 packets, if a m_pullup() fails then this pointer * becomes NULL and so we have no packet to free. */ if (*fin->fin_mp == NULL) goto finished; if (!out) { if (v == 4) { if (softc->ipf_chksrc && !ipf_verifysrc(fin)) { LBUMPD(ipf_stats[0], fr_v4_badsrc); fin->fin_flx |= FI_BADSRC; } if (fin->fin_ip->ip_ttl < softc->ipf_minttl) { LBUMPD(ipf_stats[0], fr_v4_badttl); fin->fin_flx |= FI_LOWTTL; } } #ifdef USE_INET6 else if (v == 6) { if (((ip6_t *)ip)->ip6_hlim < softc->ipf_minttl) { LBUMPD(ipf_stats[0], fr_v6_badttl); fin->fin_flx |= FI_LOWTTL; } } #endif } if (fin->fin_flx & FI_SHORT) { LBUMPD(ipf_stats[out], fr_short); } READ_ENTER(&softc->ipf_mutex); if (!out) { switch (fin->fin_v) { case 4 : if (ipf_nat_checkin(fin, &pass) == -1) { goto filterdone; } break; #ifdef USE_INET6 case 6 : if (ipf_nat6_checkin(fin, &pass) == -1) { goto filterdone; } break; #endif default : break; } } /* * Check auth now. * If a packet is found in the auth table, then skip checking * the access lists for permission but we do need to consider * the result as if it were from the ACL's. In addition, being * found in the auth table means it has been seen before, so do * not pass it through accounting (again), lest it be counted twice. */ fr = ipf_auth_check(fin, &pass); if (!out && (fr == NULL)) (void) ipf_acctpkt(fin, NULL); if (fr == NULL) { if ((fin->fin_flx & FI_FRAG) != 0) fr = ipf_frag_known(fin, &pass); if (fr == NULL) fr = ipf_state_check(fin, &pass); } if ((pass & FR_NOMATCH) || (fr == NULL)) fr = ipf_firewall(fin, &pass); /* * If we've asked to track state for this packet, set it up. * Here rather than ipf_firewall because ipf_checkauth may decide * to return a packet for "keep state" */ if ((pass & FR_KEEPSTATE) && (fin->fin_m != NULL) && !(fin->fin_flx & FI_STATE)) { if (ipf_state_add(softc, fin, NULL, 0) == 0) { LBUMP(ipf_stats[out].fr_ads); } else { LBUMP(ipf_stats[out].fr_bads); if (FR_ISPASS(pass)) { DT(frb_stateadd); pass &= ~FR_CMDMASK; pass |= FR_BLOCK; fin->fin_reason = FRB_STATEADD; } } } fin->fin_fr = fr; if ((fr != NULL) && !(fin->fin_flx & FI_STATE)) { fin->fin_dif = &fr->fr_dif; fin->fin_tif = &fr->fr_tifs[fin->fin_rev]; } /* * Only count/translate packets which will be passed on, out the * interface. */ if (out && FR_ISPASS(pass)) { (void) ipf_acctpkt(fin, NULL); switch (fin->fin_v) { case 4 : if (ipf_nat_checkout(fin, &pass) == -1) { ; } else if ((softc->ipf_update_ipid != 0) && (v == 4)) { if (ipf_updateipid(fin) == -1) { DT(frb_updateipid); LBUMP(ipf_stats[1].fr_ipud); pass &= ~FR_CMDMASK; pass |= FR_BLOCK; fin->fin_reason = FRB_UPDATEIPID; } else { LBUMP(ipf_stats[0].fr_ipud); } } break; #ifdef USE_INET6 case 6 : (void) ipf_nat6_checkout(fin, &pass); break; #endif default : break; } } filterdone: #ifdef IPFILTER_LOG if ((softc->ipf_flags & FF_LOGGING) || (pass & FR_LOGMASK)) { (void) ipf_dolog(fin, &pass); } #endif /* * The FI_STATE flag is cleared here so that calling ipf_state_check * will work when called from inside of fr_fastroute. Although * there is a similar flag, FI_NATED, for NAT, it does not have the same * impact on code execution. 
*/ fin->fin_flx &= ~FI_STATE; #if defined(FASTROUTE_RECURSION) /* * Up the reference on fr_lock and exit ipf_mutex. The generation of * a packet below can sometimes cause a recursive call into IPFilter. * On those platforms where that does happen, we need to hang onto * the filter rule just in case someone decides to remove or flush it * in the meantime. */ if (fr != NULL) { MUTEX_ENTER(&fr->fr_lock); fr->fr_ref++; MUTEX_EXIT(&fr->fr_lock); } RWLOCK_EXIT(&softc->ipf_mutex); #endif if ((pass & FR_RETMASK) != 0) { /* * Should we return an ICMP packet to indicate error * status passing through the packet filter ? * WARNING: ICMP error packets AND TCP RST packets should * ONLY be sent in response to incoming packets. Sending * them in response to outbound packets can result in a * panic on some operating systems. */ if (!out) { if (pass & FR_RETICMP) { int dst; if ((pass & FR_RETMASK) == FR_FAKEICMP) dst = 1; else dst = 0; (void) ipf_send_icmp_err(ICMP_UNREACH, fin, dst); LBUMP(ipf_stats[0].fr_ret); } else if (((pass & FR_RETMASK) == FR_RETRST) && !(fin->fin_flx & FI_SHORT)) { if (((fin->fin_flx & FI_OOW) != 0) || (ipf_send_reset(fin) == 0)) { LBUMP(ipf_stats[1].fr_ret); } } /* * When using return-* with auth rules, the auth code * takes over disposing of this packet. */ if (FR_ISAUTH(pass) && (fin->fin_m != NULL)) { DT1(frb_authcapture, fr_info_t *, fin); fin->fin_m = *fin->fin_mp = NULL; fin->fin_reason = FRB_AUTHCAPTURE; m = NULL; } } else { if (pass & FR_RETRST) { fin->fin_error = ECONNRESET; } } } /* * After the above so that ICMP unreachables and TCP RSTs get * created properly. */ if (FR_ISBLOCK(pass) && (fin->fin_flx & FI_NEWNAT)) ipf_nat_uncreate(fin); /* * If we didn't drop off the bottom of the list of rules (and thus * the 'current' rule fr is not NULL), then we may have some extra * instructions about what to do with a packet. * Once we're finished return to our caller, freeing the packet if * we are dropping it. */ if (fr != NULL) { frdest_t *fdp; /* * Generate a duplicated packet first because ipf_fastroute * can lead to fin_m being free'd... not good. 
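* This is the path a dup-to rule relies on: the copy in mc is * fastrouted to the duplicate destination first, leaving the original * buffer intact for normal disposition. A hypothetical rule such as * "pass in on em0 dup-to em1 from any to any" would exercise it.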
*/ fdp = fin->fin_dif; if ((fdp != NULL) && (fdp->fd_ptr != NULL) && (fdp->fd_ptr != (void *)-1)) { - mc = M_COPY(fin->fin_m); + mc = M_COPYM(fin->fin_m); if (mc != NULL) ipf_fastroute(mc, &mc, fin, fdp); } fdp = fin->fin_tif; if (!out && (pass & FR_FASTROUTE)) { /* * For fastroute rule, no destination interface defined * so pass NULL as the frdest_t parameter */ (void) ipf_fastroute(fin->fin_m, mp, fin, NULL); m = *mp = NULL; } else if ((fdp != NULL) && (fdp->fd_ptr != NULL) && (fdp->fd_ptr != (struct ifnet *)-1)) { /* this is for to rules: */ ipf_fastroute(fin->fin_m, mp, fin, fdp); m = *mp = NULL; } #if defined(FASTROUTE_RECURSION) (void) ipf_derefrule(softc, &fr); #endif } #if !defined(FASTROUTE_RECURSION) RWLOCK_EXIT(&softc->ipf_mutex); #endif finished: if (!FR_ISPASS(pass)) { LBUMP(ipf_stats[out].fr_block); if (*mp != NULL) { #ifdef _KERNEL FREE_MB_T(*mp); #endif m = *mp = NULL; } } else { LBUMP(ipf_stats[out].fr_pass); #if defined(_KERNEL) && defined(__sgi) if ((fin->fin_hbuf != NULL) && (mtod(fin->fin_m, struct ip *) != fin->fin_ip)) { COPYBACK(fin->fin_m, 0, fin->fin_plen, fin->fin_hbuf); } #endif } SPL_X(s); #ifdef _KERNEL if (FR_ISPASS(pass)) return 0; LBUMP(ipf_stats[out].fr_blocked[fin->fin_reason]); return fin->fin_error; #else /* _KERNEL */ if (*mp != NULL) (*mp)->mb_ifp = fin->fin_ifp; blockreason = fin->fin_reason; FR_VERBOSE(("fin_flx %#x pass %#x ", fin->fin_flx, pass)); /*if ((pass & FR_CMDMASK) == (softc->ipf_pass & FR_CMDMASK))*/ if ((pass & FR_NOMATCH) != 0) return 1; if ((pass & FR_RETMASK) != 0) switch (pass & FR_RETMASK) { case FR_RETRST : return 3; case FR_RETICMP : return 4; case FR_FAKEICMP : return 5; } switch (pass & FR_CMDMASK) { case FR_PASS : return 0; case FR_BLOCK : return -1; case FR_AUTH : return -2; case FR_ACCOUNT : return -3; case FR_PREAUTH : return -4; } return 2; #endif /* _KERNEL */ } #ifdef IPFILTER_LOG /* ------------------------------------------------------------------------ */ /* Function: ipf_dolog */ /* Returns: frentry_t* - returns contents of fin_fr (no change made) */ /* Parameters: fin(I) - pointer to packet information */ /* passp(IO) - pointer to current/new filter decision (unused) */ /* */ /* Checks flags set to see how a packet should be logged, if it is to be */ /* logged. Adjust statistics based on its success or not. */ /* ------------------------------------------------------------------------ */ frentry_t * ipf_dolog(fin, passp) fr_info_t *fin; u_32_t *passp; { ipf_main_softc_t *softc = fin->fin_main_soft; u_32_t pass; int out; out = fin->fin_out; pass = *passp; if ((softc->ipf_flags & FF_LOGNOMATCH) && (pass & FR_NOMATCH)) { pass |= FF_LOGNOMATCH; LBUMPD(ipf_stats[out], fr_npkl); goto logit; } else if (((pass & FR_LOGMASK) == FR_LOGP) || (FR_ISPASS(pass) && (softc->ipf_flags & FF_LOGPASS))) { if ((pass & FR_LOGMASK) != FR_LOGP) pass |= FF_LOGPASS; LBUMPD(ipf_stats[out], fr_ppkl); goto logit; } else if (((pass & FR_LOGMASK) == FR_LOGB) || (FR_ISBLOCK(pass) && (softc->ipf_flags & FF_LOGBLOCK))) { if ((pass & FR_LOGMASK) != FR_LOGB) pass |= FF_LOGBLOCK; LBUMPD(ipf_stats[out], fr_bpkl); logit: if (ipf_log_pkt(fin, pass) == -1) { /* * If the "or-block" option has been used then * block the packet if we failed to log it. 
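* In rule syntax this is the log option spelled along the lines of * "pass in log or-block ..." (illustrative); with FR_LOGORBLOCK set, a * failed ipf_log_pkt turns the decision into a block so the packet * cannot slip through unlogged.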
*/ if ((pass & FR_LOGORBLOCK) && FR_ISPASS(pass)) { DT1(frb_logfail2, u_int, pass); pass &= ~FR_CMDMASK; pass |= FR_BLOCK; fin->fin_reason = FRB_LOGFAIL2; } } *passp = pass; } return fin->fin_fr; } #endif /* IPFILTER_LOG */ /* ------------------------------------------------------------------------ */ /* Function: ipf_cksum */ /* Returns: u_short - IP header checksum */ /* Parameters: addr(I) - pointer to start of buffer to checksum */ /* len(I) - length of buffer in bytes */ /* */ /* Calculate the one's complement 16 bit checksum of the buffer passed. */ /* */ /* N.B.: addr should be 16-bit aligned. */ /* ------------------------------------------------------------------------ */ u_short ipf_cksum(addr, len) u_short *addr; int len; { u_32_t sum = 0; for (sum = 0; len > 1; len -= 2) sum += *addr++; /* mop up an odd byte, if necessary */ if (len == 1) sum += *(u_char *)addr; /* * add back carry outs from top 16 bits to low 16 bits */ sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ sum += (sum >> 16); /* add carry */ return (u_short)(~sum); } /* ------------------------------------------------------------------------ */ /* Function: fr_cksum */ /* Returns: u_short - layer 4 checksum */ /* Parameters: fin(I) - pointer to packet information */ /* ip(I) - pointer to IP header */ /* l4proto(I) - protocol to calculate checksum for */ /* l4hdr(I) - pointer to layer 4 header */ /* */ /* Calculates the layer 4 checksum for the packet held in "m", using the */ /* data in the IP header "ip" to seed it. */ /* */ /* NB: This function assumes we've pullup'd enough for all of the IP header */ /* and the TCP header. We also assume that data blocks aren't allocated in */ /* odd sizes. */ /* */ /* Expects ip_len and ip_off to be in network byte order when called. */ /* ------------------------------------------------------------------------ */ u_short fr_cksum(fin, ip, l4proto, l4hdr) fr_info_t *fin; ip_t *ip; int l4proto; void *l4hdr; { u_short *sp, slen, sumsave, *csump; u_int sum, sum2; int hlen; int off; #ifdef USE_INET6 ip6_t *ip6; #endif csump = NULL; sumsave = 0; sp = NULL; slen = 0; hlen = 0; sum = 0; sum = htons((u_short)l4proto); /* * Add up IP Header portion */ #ifdef USE_INET6 if (IP_V(ip) == 4) { #endif hlen = IP_HL(ip) << 2; off = hlen; sp = (u_short *)&ip->ip_src; sum += *sp++; /* ip_src */ sum += *sp++; sum += *sp++; /* ip_dst */ sum += *sp++; #ifdef USE_INET6 } else if (IP_V(ip) == 6) { ip6 = (ip6_t *)ip; hlen = sizeof(*ip6); off = ((char *)fin->fin_dp - (char *)fin->fin_ip); sp = (u_short *)&ip6->ip6_src; sum += *sp++; /* ip6_src */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; /* This needs to be routing header aware. 
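With a type 0 routing header present, the pseudo-header must be computed over the final destination, i.e. the last address in the routing header, rather than the ip6_dst field summed below (RFC 2460, section 8.1).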
*/ sum += *sp++; /* ip6_dst */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; } else { return 0xffff; } #endif slen = fin->fin_plen - off; sum += htons(slen); switch (l4proto) { case IPPROTO_UDP : csump = &((udphdr_t *)l4hdr)->uh_sum; break; case IPPROTO_TCP : csump = &((tcphdr_t *)l4hdr)->th_sum; break; case IPPROTO_ICMP : csump = &((icmphdr_t *)l4hdr)->icmp_cksum; sum = 0; /* Pseudo-checksum is not included */ break; #ifdef USE_INET6 case IPPROTO_ICMPV6 : csump = &((struct icmp6_hdr *)l4hdr)->icmp6_cksum; break; #endif default : break; } if (csump != NULL) { sumsave = *csump; *csump = 0; } sum2 = ipf_pcksum(fin, off, sum); if (csump != NULL) *csump = sumsave; return sum2; } /* ------------------------------------------------------------------------ */ /* Function: ipf_findgroup */ /* Returns: frgroup_t * - NULL = group not found, else pointer to group */ /* Parameters: softc(I) - pointer to soft context main structure */ /* group(I) - group name to search for */ /* unit(I) - device to which this group belongs */ /* set(I) - which set of rules (active/inactive) this is */ /* fgpp(O) - pointer to place to store pointer to the pointer */ /* to where to add the next (last) group or where */ /* to delete group from. */ /* */ /* Search amongst the defined groups for a particular group name. */ /* ------------------------------------------------------------------------ */ frgroup_t * ipf_findgroup(softc, group, unit, set, fgpp) ipf_main_softc_t *softc; char *group; minor_t unit; int set; frgroup_t ***fgpp; { frgroup_t *fg, **fgp; /* * Which list of groups to search in is dependent on which list of * rules are being operated on. */ fgp = &softc->ipf_groups[unit][set]; while ((fg = *fgp) != NULL) { if (strncmp(group, fg->fg_name, FR_GROUPLEN) == 0) break; else fgp = &fg->fg_next; } if (fgpp != NULL) *fgpp = fgp; return fg; } /* ------------------------------------------------------------------------ */ /* Function: ipf_group_add */ /* Returns: frgroup_t * - NULL == did not create group, */ /* != NULL == pointer to the group */ /* Parameters: softc(I) - pointer to soft context main structure */ /* group(I) - group name to add */ /* head(I) - rule pointer that is using this as the head */ /* flags(I) - rule flags which describe the type of rule it is */ /* unit(I) - device to which this group will belong to */ /* set(I) - which set of rules (active/inactive) this is */ /* Write Locks: ipf_mutex */ /* */ /* Add a new group head, or if it already exists, increase the reference */ /* count to it. 
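For example, the first rule to name group 100 creates the frgroup_t with fg_ref set to 1; each later rule naming the same group just increments fg_ref, and the group is only freed once ipf_group_del brings the count back to zero and fg_start is empty.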
*/ /* ------------------------------------------------------------------------ */ frgroup_t * ipf_group_add(softc, group, head, flags, unit, set) ipf_main_softc_t *softc; char *group; void *head; u_32_t flags; minor_t unit; int set; { frgroup_t *fg, **fgp; u_32_t gflags; if (group == NULL) return NULL; if (unit == IPL_LOGIPF && *group == '\0') return NULL; fgp = NULL; gflags = flags & FR_INOUT; fg = ipf_findgroup(softc, group, unit, set, &fgp); if (fg != NULL) { if (fg->fg_head == NULL && head != NULL) fg->fg_head = head; if (fg->fg_flags == 0) fg->fg_flags = gflags; else if (gflags != fg->fg_flags) return NULL; fg->fg_ref++; return fg; } KMALLOC(fg, frgroup_t *); if (fg != NULL) { fg->fg_head = head; fg->fg_start = NULL; fg->fg_next = *fgp; bcopy(group, fg->fg_name, strlen(group) + 1); fg->fg_flags = gflags; fg->fg_ref = 1; fg->fg_set = &softc->ipf_groups[unit][set]; *fgp = fg; } return fg; } /* ------------------------------------------------------------------------ */ /* Function: ipf_group_del */ /* Returns: Nil */ /* Parameters: softc(I) - pointer to soft context main structure */ /* group(I) - pointer to the group to deference */ /* fr(I) - filter rule from which group is referenced */ /* Write Locks: ipf_mutex */ /* */ /* This function is called whenever a reference to a group is to be dropped */ /* and thus its reference count needs to be lowered and the group free'd if */ /* the reference count reaches zero. Passing in fr is really for the sole */ /* purpose of knowing when the head rule is being deleted. */ /* ------------------------------------------------------------------------ */ void ipf_group_del(softc, group, fr) ipf_main_softc_t *softc; frgroup_t *group; frentry_t *fr; { if (group->fg_head == fr) group->fg_head = NULL; group->fg_ref--; if ((group->fg_ref == 0) && (group->fg_start == NULL)) ipf_group_free(group); } /* ------------------------------------------------------------------------ */ /* Function: ipf_group_free */ /* Returns: Nil */ /* Parameters: group(I) - pointer to filter rule group */ /* */ /* Remove the group from the list of groups and free it. */ /* ------------------------------------------------------------------------ */ static void ipf_group_free(group) frgroup_t *group; { frgroup_t **gp; for (gp = group->fg_set; *gp != NULL; gp = &(*gp)->fg_next) { if (*gp == group) { *gp = group->fg_next; break; } } KFREE(group); } /* ------------------------------------------------------------------------ */ /* Function: ipf_group_flush */ /* Returns: int - number of rules flushed from group */ /* Parameters: softc(I) - pointer to soft context main structure */ /* Parameters: group(I) - pointer to filter rule group */ /* */ /* Remove all of the rules that currently are listed under the given group. */ /* ------------------------------------------------------------------------ */ static int ipf_group_flush(softc, group) ipf_main_softc_t *softc; frgroup_t *group; { int gone = 0; (void) ipf_flushlist(softc, &gone, &group->fg_start); return gone; } /* ------------------------------------------------------------------------ */ /* Function: ipf_getrulen */ /* Returns: frentry_t * - NULL == not found, else pointer to rule n */ /* Parameters: softc(I) - pointer to soft context main structure */ /* Parameters: unit(I) - device for which to count the rule's number */ /* flags(I) - which set of rules to find the rule in */ /* group(I) - group name */ /* n(I) - rule number to find */ /* */ /* Find rule # n in group # g and return a pointer to it. 
Return NULL if */ /* group # g doesn't exist or there are fewer than n rules in the group. */ /* ------------------------------------------------------------------------ */ frentry_t * ipf_getrulen(softc, unit, group, n) ipf_main_softc_t *softc; int unit; char *group; u_32_t n; { frentry_t *fr; frgroup_t *fg; fg = ipf_findgroup(softc, group, unit, softc->ipf_active, NULL); if (fg == NULL) return NULL; for (fr = fg->fg_start; fr && n; fr = fr->fr_next, n--) ; if (n != 0) return NULL; return fr; } /* ------------------------------------------------------------------------ */ /* Function: ipf_flushlist */ /* Returns: int - >= 0 - number of flushed rules */ /* Parameters: softc(I) - pointer to soft context main structure */ /* nfreedp(O) - pointer to int where flush count is stored */ /* listp(I) - pointer to list to flush pointer */ /* Write Locks: ipf_mutex */ /* */ /* Recursively flush rules from the list, descending groups as they are */ /* encountered. If a rule is the head of a group and it has lost all its */ /* group members, then also delete the group reference. nfreedp is needed */ /* to store the accumulating count of rules removed, whereas the returned */ /* value is just the number removed from the current list. The latter is */ /* needed to correctly adjust reference counts on rules that define groups. */ /* */ /* NOTE: Rules not loaded from user space cannot be flushed. */ /* ------------------------------------------------------------------------ */ static int ipf_flushlist(softc, nfreedp, listp) ipf_main_softc_t *softc; int *nfreedp; frentry_t **listp; { int freed = 0; frentry_t *fp; while ((fp = *listp) != NULL) { if ((fp->fr_type & FR_T_BUILTIN) || !(fp->fr_flags & FR_COPIED)) { listp = &fp->fr_next; continue; } *listp = fp->fr_next; if (fp->fr_next != NULL) fp->fr_next->fr_pnext = fp->fr_pnext; fp->fr_pnext = NULL; if (fp->fr_grphead != NULL) { freed += ipf_group_flush(softc, fp->fr_grphead); fp->fr_names[fp->fr_grhead] = '\0'; } if (fp->fr_icmpgrp != NULL) { freed += ipf_group_flush(softc, fp->fr_icmpgrp); fp->fr_names[fp->fr_icmphead] = '\0'; } if (fp->fr_srctrack.ht_max_nodes) ipf_rb_ht_flush(&fp->fr_srctrack); fp->fr_next = NULL; ASSERT(fp->fr_ref > 0); if (ipf_derefrule(softc, &fp) == 0) freed++; } *nfreedp += freed; return freed; } /* ------------------------------------------------------------------------ */ /* Function: ipf_flush */ /* Returns: int - >= 0 - number of flushed rules */ /* Parameters: softc(I) - pointer to soft context main structure */ /* unit(I) - device for which to flush rules */ /* flags(I) - which set of rules to flush */ /* */ /* Calls flushlist() for all filter rules (accounting, firewall - both IPv4 */ /* and IPv6) as defined by the value of flags. 
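For instance, flushing only the outbound set (FR_OUTQUE, as a command like ipf -Fo would request, assuming the usual ipf(8) flag mapping) walks ipf_rules[1][set] and ipf_acct[1][set], while FR_INQUE covers the [0] lists and flushing everything sets both flags.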
*/ /* ------------------------------------------------------------------------ */ int ipf_flush(softc, unit, flags) ipf_main_softc_t *softc; minor_t unit; int flags; { int flushed = 0, set; WRITE_ENTER(&softc->ipf_mutex); set = softc->ipf_active; if ((flags & FR_INACTIVE) == FR_INACTIVE) set = 1 - set; if (flags & FR_OUTQUE) { ipf_flushlist(softc, &flushed, &softc->ipf_rules[1][set]); ipf_flushlist(softc, &flushed, &softc->ipf_acct[1][set]); } if (flags & FR_INQUE) { ipf_flushlist(softc, &flushed, &softc->ipf_rules[0][set]); ipf_flushlist(softc, &flushed, &softc->ipf_acct[0][set]); } flushed += ipf_flush_groups(softc, &softc->ipf_groups[unit][set], flags & (FR_INQUE|FR_OUTQUE)); RWLOCK_EXIT(&softc->ipf_mutex); if (unit == IPL_LOGIPF) { int tmp; tmp = ipf_flush(softc, IPL_LOGCOUNT, flags); if (tmp >= 0) flushed += tmp; } return flushed; } /* ------------------------------------------------------------------------ */ /* Function: ipf_flush_groups */ /* Returns: int - >= 0 - number of flushed rules */ /* Parameters: softc(I) - soft context pointer to work with */ /* grhead(I) - pointer to the start of the group list to flush */ /* flags(I) - which set of rules to flush */ /* */ /* Walk through all of the groups under the given group head and remove all */ /* of those that match the flags passed in. The for loop here is a bit more */ /* complicated than usual because the removal of a rule with ipf_derefrule */ /* may end up removing not only the structure pointed to by "fg" but also */ /* what is fg_next and fg_next after that. So if a filter rule is actually */ /* removed from the group then it is necessary to start again. */ /* ------------------------------------------------------------------------ */ static int ipf_flush_groups(softc, grhead, flags) ipf_main_softc_t *softc; frgroup_t **grhead; int flags; { frentry_t *fr, **frp; frgroup_t *fg, **fgp; int flushed = 0; int removed = 0; for (fgp = grhead; (fg = *fgp) != NULL; ) { while ((fg != NULL) && ((fg->fg_flags & flags) == 0)) fg = fg->fg_next; if (fg == NULL) break; removed = 0; frp = &fg->fg_start; while ((removed == 0) && ((fr = *frp) != NULL)) { if ((fr->fr_flags & flags) == 0) { frp = &fr->fr_next; } else { if (fr->fr_next != NULL) fr->fr_next->fr_pnext = fr->fr_pnext; *frp = fr->fr_next; fr->fr_pnext = NULL; fr->fr_next = NULL; (void) ipf_derefrule(softc, &fr); flushed++; removed++; } } if (removed == 0) fgp = &fg->fg_next; } return flushed; } /* ------------------------------------------------------------------------ */ /* Function: memstr */ /* Returns: char * - NULL if failed, != NULL pointer to matching bytes */ /* Parameters: src(I) - pointer to byte sequence to match */ /* dst(I) - pointer to byte sequence to search */ /* slen(I) - match length */ /* dlen(I) - length available to search in */ /* */ /* Search dst for a sequence of bytes matching those at src and extend for */ /* slen bytes. */ /* ------------------------------------------------------------------------ */ char * memstr(src, dst, slen, dlen) const char *src; char *dst; size_t slen, dlen; { char *s = NULL; while (dlen >= slen) { if (bcmp(src, dst, slen) == 0) { s = dst; break; } dst++; dlen--; } return s; } /* ------------------------------------------------------------------------ */ /* Function: ipf_fixskip */ /* Returns: Nil */ /* Parameters: listp(IO) - pointer to start of list with skip rule */ /* rp(I) - rule added/removed with skip in it. 
addremove(I) - adjustment (-1/+1) to make to skip count, */ /* depending on whether a rule was just added */ /* or removed. */ /* */ /* Adjust all the rules in a list which would have skip'd past the position */ /* where we are inserting to skip to the right place given the change. */ /* ------------------------------------------------------------------------ */ void ipf_fixskip(listp, rp, addremove) frentry_t **listp, *rp; int addremove; { int rules, rn; frentry_t *fp; rules = 0; for (fp = *listp; (fp != NULL) && (fp != rp); fp = fp->fr_next) rules++; if (!fp) return; for (rn = 0, fp = *listp; fp && (fp != rp); fp = fp->fr_next, rn++) if (FR_ISSKIP(fp->fr_flags) && (rn + fp->fr_arg >= rules)) fp->fr_arg += addremove; } #ifdef _KERNEL /* ------------------------------------------------------------------------ */ /* Function: count4bits */ /* Returns: int - >= 0 - number of consecutive bits in input */ /* Parameters: ip(I) - 32bit IP address */ /* */ /* IPv4 ONLY */ /* count consecutive 1's in bit mask. If the mask generated by counting */ /* consecutive 1's is different to that passed, return -1, else return # */ /* of bits. */ /* ------------------------------------------------------------------------ */ int count4bits(ip) u_32_t ip; { u_32_t ipn; int cnt = 0, i, j; ip = ipn = ntohl(ip); for (i = 32; i; i--, ipn *= 2) if (ipn & 0x80000000) cnt++; else break; ipn = 0; for (i = 32, j = cnt; i; i--, j--) { ipn *= 2; if (j > 0) ipn++; } if (ipn == ip) return cnt; return -1; } /* ------------------------------------------------------------------------ */ /* Function: count6bits */ /* Returns: int - >= 0 - number of consecutive bits in input */ /* Parameters: msk(I) - pointer to start of IPv6 bitmask */ /* */ /* IPv6 ONLY */ /* count consecutive 1's in bit mask. */ /* ------------------------------------------------------------------------ */ # ifdef USE_INET6 int count6bits(msk) u_32_t *msk; { int i = 0, k; u_32_t j; for (k = 3; k >= 0; k--) if (msk[k] == 0xffffffff) i += 32; else { for (j = msk[k]; j; j <<= 1) if (j & 0x80000000) i++; } return i; } # endif #endif /* _KERNEL */ /* ------------------------------------------------------------------------ */ /* Function: ipf_synclist */ /* Returns: int - 0 = no failures, else indication of first failure */ /* Parameters: fr(I) - start of filter list to sync interface names for */ /* ifp(I) - interface pointer for limiting sync lookups */ /* Write Locks: ipf_mutex */ /* */ /* Walk through a list of filter rules and resolve any interface names into */ /* pointers. Where dynamic addresses are used, also update the IP address */ /* used in the rule. The interface pointer is used to limit the lookups to */ /* a specific set of matching names if it is non-NULL. */ /* Errors can occur when resolving the destination name of to/dup-to fields */ /* when the name points to a pool and that pool does not exist. If this */ /* does happen then it is necessary to check if there are any lookup refs */ /* that need to be dropped before returning with an error. */ /* ------------------------------------------------------------------------ */ static int ipf_synclist(softc, fr, ifp) ipf_main_softc_t *softc; frentry_t *fr; void *ifp; { frentry_t *frt, *start = fr; frdest_t *fdp; char *name; int error; void *ifa; int v, i; error = 0; for (; fr; fr = fr->fr_next) { if (fr->fr_family == AF_INET) v = 4; else if (fr->fr_family == AF_INET6) v = 6; else v = 0; /* * Lookup all the interface names that are part of the rule. 
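* The loop bound of four below matches the fr_ifnames[]/fr_ifas[] * slots in a rule; each slot that is in use (index != -1) is resolved * from interface name to interface pointer here.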
*/ for (i = 0; i < 4; i++) { if ((ifp != NULL) && (fr->fr_ifas[i] != ifp)) continue; if (fr->fr_ifnames[i] == -1) continue; name = FR_NAME(fr, fr_ifnames[i]); fr->fr_ifas[i] = ipf_resolvenic(softc, name, v); } if ((fr->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) { if (fr->fr_satype != FRI_NORMAL && fr->fr_satype != FRI_LOOKUP) { ifa = ipf_resolvenic(softc, fr->fr_names + fr->fr_sifpidx, v); ipf_ifpaddr(softc, v, fr->fr_satype, ifa, &fr->fr_src6, &fr->fr_smsk6); } if (fr->fr_datype != FRI_NORMAL && fr->fr_datype != FRI_LOOKUP) { ifa = ipf_resolvenic(softc, fr->fr_names + fr->fr_sifpidx, v); ipf_ifpaddr(softc, v, fr->fr_datype, ifa, &fr->fr_dst6, &fr->fr_dmsk6); } } fdp = &fr->fr_tifs[0]; if ((ifp == NULL) || (fdp->fd_ptr == ifp)) { error = ipf_resolvedest(softc, fr->fr_names, fdp, v); if (error != 0) goto unwind; } fdp = &fr->fr_tifs[1]; if ((ifp == NULL) || (fdp->fd_ptr == ifp)) { error = ipf_resolvedest(softc, fr->fr_names, fdp, v); if (error != 0) goto unwind; } fdp = &fr->fr_dif; if ((ifp == NULL) || (fdp->fd_ptr == ifp)) { error = ipf_resolvedest(softc, fr->fr_names, fdp, v); if (error != 0) goto unwind; } if (((fr->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) && (fr->fr_satype == FRI_LOOKUP) && (fr->fr_srcptr == NULL)) { fr->fr_srcptr = ipf_lookup_res_num(softc, fr->fr_srctype, IPL_LOGIPF, fr->fr_srcnum, &fr->fr_srcfunc); } if (((fr->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) && (fr->fr_datype == FRI_LOOKUP) && (fr->fr_dstptr == NULL)) { fr->fr_dstptr = ipf_lookup_res_num(softc, fr->fr_dsttype, IPL_LOGIPF, fr->fr_dstnum, &fr->fr_dstfunc); } } return 0; unwind: for (frt = start; frt != fr; frt = frt->fr_next) { if (((frt->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) && (frt->fr_satype == FRI_LOOKUP) && (frt->fr_srcptr != NULL)) ipf_lookup_deref(softc, frt->fr_srctype, frt->fr_srcptr); if (((frt->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) && (frt->fr_datype == FRI_LOOKUP) && (frt->fr_dstptr != NULL)) ipf_lookup_deref(softc, frt->fr_dsttype, frt->fr_dstptr); } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_sync */ /* Returns: void */ /* Parameters: Nil */ /* */ /* ipf_sync() is called when we suspect that the interface list or */ /* information about interfaces (like IP#) has changed. Go through all */ /* filter rules, NAT entries and the state table and check if anything */ /* needs to be changed/updated. */ /* ------------------------------------------------------------------------ */ int ipf_sync(softc, ifp) ipf_main_softc_t *softc; void *ifp; { int i; # if !SOLARIS ipf_nat_sync(softc, ifp); ipf_state_sync(softc, ifp); ipf_lookup_sync(softc, ifp); # endif WRITE_ENTER(&softc->ipf_mutex); (void) ipf_synclist(softc, softc->ipf_acct[0][softc->ipf_active], ifp); (void) ipf_synclist(softc, softc->ipf_acct[1][softc->ipf_active], ifp); (void) ipf_synclist(softc, softc->ipf_rules[0][softc->ipf_active], ifp); (void) ipf_synclist(softc, softc->ipf_rules[1][softc->ipf_active], ifp); for (i = 0; i < IPL_LOGSIZE; i++) { frgroup_t *g; for (g = softc->ipf_groups[i][0]; g != NULL; g = g->fg_next) (void) ipf_synclist(softc, g->fg_start, ifp); for (g = softc->ipf_groups[i][1]; g != NULL; g = g->fg_next) (void) ipf_synclist(softc, g->fg_start, ifp); } RWLOCK_EXIT(&softc->ipf_mutex); return 0; } /* * In the functions below, bcopy() is called because the pointer being * copied _from_ in this instance is a pointer to a char buf (which could * end up being unaligned) and on the kernel's local stack. 
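* As an illustrative sketch: an ioctl handler whose data argument holds * the address of a user pointer can fetch the object it refers to with * copyinptr(softc, data, &obj, sizeof(obj)) - obj being a hypothetical * local - which reads the embedded pointer first and then copies the * object itself from where that pointer points.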
*/ /* ------------------------------------------------------------------------ */ /* Function: copyinptr */ /* Returns: int - 0 = success, else failure */ /* Parameters: src(I) - pointer to the source address */ /* dst(I) - destination address */ /* size(I) - number of bytes to copy */ /* */ /* Copy a block of data in from user space, given a pointer to the pointer */ /* to start copying from (src) and a pointer to where to store it (dst). */ /* NB: src - pointer to user space pointer, dst - kernel space pointer */ /* ------------------------------------------------------------------------ */ int copyinptr(softc, src, dst, size) ipf_main_softc_t *softc; void *src, *dst; size_t size; { caddr_t ca; int error; # if SOLARIS error = COPYIN(src, &ca, sizeof(ca)); if (error != 0) return error; # else bcopy(src, (caddr_t)&ca, sizeof(ca)); # endif error = COPYIN(ca, dst, size); if (error != 0) { IPFERROR(3); error = EFAULT; } return error; } /* ------------------------------------------------------------------------ */ /* Function: copyoutptr */ /* Returns: int - 0 = success, else failure */ /* Parameters: src(I) - pointer to the source address */ /* dst(I) - destination address */ /* size(I) - number of bytes to copy */ /* */ /* Copy a block of data out to user space, given a pointer to the data to */ /* copy from (src) and a pointer to the pointer at which to store it (dst). */ /* NB: src - kernel space pointer, dst - pointer to user space pointer. */ /* ------------------------------------------------------------------------ */ int copyoutptr(softc, src, dst, size) ipf_main_softc_t *softc; void *src, *dst; size_t size; { caddr_t ca; int error; bcopy(dst, (caddr_t)&ca, sizeof(ca)); error = COPYOUT(src, ca, size); if (error != 0) { IPFERROR(4); error = EFAULT; } return error; } #ifdef _KERNEL #endif /* ------------------------------------------------------------------------ */ /* Function: ipf_lock */ /* Returns: int - 0 = success, else error */ /* Parameters: data(I) - pointer to lock value to set */ /* lockp(O) - pointer to location to store old lock value */ /* */ /* Get the new value for the lock integer, set it and return the old value */ /* in *lockp. */ /* ------------------------------------------------------------------------ */ int ipf_lock(data, lockp) caddr_t data; int *lockp; { int arg, err; err = BCOPYIN(data, &arg, sizeof(arg)); if (err != 0) return EFAULT; err = BCOPYOUT(lockp, data, sizeof(*lockp)); if (err != 0) return EFAULT; *lockp = arg; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_getstat */ /* Returns: Nil */ /* Parameters: softc(I) - pointer to soft context main structure */ /* fiop(I) - pointer to ipfilter stats structure */ /* rev(I) - version claimed by program doing ioctl */ /* */ /* Stores a copy of current pointers, counters, etc, in the friostat */ /* structure. */ /* If IPFILTER_COMPAT is compiled, we pretend to be whatever version the */ /* program is looking for. This ensures that validation of the version it */ /* expects will always succeed. Thus kernels with IPFILTER_COMPAT will */ /* allow older binaries to work but kernels without it will not. 
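For example, with IPFILTER_COMPAT a caller claiming rev 4010100 is answered with the string "IP Filter: v4.1.1", the three fields being (rev / 1000000) % 100, (rev / 10000) % 100 and (rev / 100) % 100.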
*/ /* ------------------------------------------------------------------------ */ /*ARGSUSED*/ static void ipf_getstat(softc, fiop, rev) ipf_main_softc_t *softc; friostat_t *fiop; int rev; { int i; bcopy((char *)softc->ipf_stats, (char *)fiop->f_st, sizeof(ipf_statistics_t) * 2); fiop->f_locks[IPL_LOGSTATE] = -1; fiop->f_locks[IPL_LOGNAT] = -1; fiop->f_locks[IPL_LOGIPF] = -1; fiop->f_locks[IPL_LOGAUTH] = -1; fiop->f_ipf[0][0] = softc->ipf_rules[0][0]; fiop->f_acct[0][0] = softc->ipf_acct[0][0]; fiop->f_ipf[0][1] = softc->ipf_rules[0][1]; fiop->f_acct[0][1] = softc->ipf_acct[0][1]; fiop->f_ipf[1][0] = softc->ipf_rules[1][0]; fiop->f_acct[1][0] = softc->ipf_acct[1][0]; fiop->f_ipf[1][1] = softc->ipf_rules[1][1]; fiop->f_acct[1][1] = softc->ipf_acct[1][1]; fiop->f_ticks = softc->ipf_ticks; fiop->f_active = softc->ipf_active; fiop->f_froute[0] = softc->ipf_frouteok[0]; fiop->f_froute[1] = softc->ipf_frouteok[1]; fiop->f_rb_no_mem = softc->ipf_rb_no_mem; fiop->f_rb_node_max = softc->ipf_rb_node_max; fiop->f_running = softc->ipf_running; for (i = 0; i < IPL_LOGSIZE; i++) { fiop->f_groups[i][0] = softc->ipf_groups[i][0]; fiop->f_groups[i][1] = softc->ipf_groups[i][1]; } #ifdef IPFILTER_LOG fiop->f_log_ok = ipf_log_logok(softc, IPL_LOGIPF); fiop->f_log_fail = ipf_log_failures(softc, IPL_LOGIPF); fiop->f_logging = 1; #else fiop->f_log_ok = 0; fiop->f_log_fail = 0; fiop->f_logging = 0; #endif fiop->f_defpass = softc->ipf_pass; fiop->f_features = ipf_features; #ifdef IPFILTER_COMPAT sprintf(fiop->f_version, "IP Filter: v%d.%d.%d", (rev / 1000000) % 100, (rev / 10000) % 100, (rev / 100) % 100); #else rev = rev; (void) strncpy(fiop->f_version, ipfilter_version, sizeof(fiop->f_version)); #endif } #ifdef USE_INET6 int icmptoicmp6types[ICMP_MAXTYPE+1] = { ICMP6_ECHO_REPLY, /* 0: ICMP_ECHOREPLY */ -1, /* 1: UNUSED */ -1, /* 2: UNUSED */ ICMP6_DST_UNREACH, /* 3: ICMP_UNREACH */ -1, /* 4: ICMP_SOURCEQUENCH */ ND_REDIRECT, /* 5: ICMP_REDIRECT */ -1, /* 6: UNUSED */ -1, /* 7: UNUSED */ ICMP6_ECHO_REQUEST, /* 8: ICMP_ECHO */ -1, /* 9: UNUSED */ -1, /* 10: UNUSED */ ICMP6_TIME_EXCEEDED, /* 11: ICMP_TIMXCEED */ ICMP6_PARAM_PROB, /* 12: ICMP_PARAMPROB */ -1, /* 13: ICMP_TSTAMP */ -1, /* 14: ICMP_TSTAMPREPLY */ -1, /* 15: ICMP_IREQ */ -1, /* 16: ICMP_IREQREPLY */ -1, /* 17: ICMP_MASKREQ */ -1, /* 18: ICMP_MASKREPLY */ }; int icmptoicmp6unreach[ICMP_MAX_UNREACH] = { ICMP6_DST_UNREACH_ADDR, /* 0: ICMP_UNREACH_NET */ ICMP6_DST_UNREACH_ADDR, /* 1: ICMP_UNREACH_HOST */ -1, /* 2: ICMP_UNREACH_PROTOCOL */ ICMP6_DST_UNREACH_NOPORT, /* 3: ICMP_UNREACH_PORT */ -1, /* 4: ICMP_UNREACH_NEEDFRAG */ ICMP6_DST_UNREACH_NOTNEIGHBOR, /* 5: ICMP_UNREACH_SRCFAIL */ ICMP6_DST_UNREACH_ADDR, /* 6: ICMP_UNREACH_NET_UNKNOWN */ ICMP6_DST_UNREACH_ADDR, /* 7: ICMP_UNREACH_HOST_UNKNOWN */ -1, /* 8: ICMP_UNREACH_ISOLATED */ ICMP6_DST_UNREACH_ADMIN, /* 9: ICMP_UNREACH_NET_PROHIB */ ICMP6_DST_UNREACH_ADMIN, /* 10: ICMP_UNREACH_HOST_PROHIB */ -1, /* 11: ICMP_UNREACH_TOSNET */ -1, /* 12: ICMP_UNREACH_TOSHOST */ ICMP6_DST_UNREACH_ADMIN, /* 13: ICMP_UNREACH_ADMIN_PROHIBIT */ }; int icmpreplytype6[ICMP6_MAXTYPE + 1]; #endif int icmpreplytype4[ICMP_MAXTYPE + 1]; /* ------------------------------------------------------------------------ */ /* Function: ipf_matchicmpqueryreply */ /* Returns: int - 1 if "icmp" is a valid reply to "ic" else 0. 
*/ /* Parameters: v(I) - IP protocol version (4 or 6) */ /* ic(I) - ICMP information */ /* icmp(I) - ICMP packet header */ /* rev(I) - direction (0 = forward/1 = reverse) of packet */ /* */ /* Check if the ICMP packet defined by the header pointed to by icmp is a */ /* reply to one as described by what's in ic. If it is a match, return 1, */ /* else return 0 for no match. E.g. an outbound echo request (type 8) is */ /* matched on its return trip by an echo reply (type 0) carrying the same */ /* icmp_id. */ /* ------------------------------------------------------------------------ */ int ipf_matchicmpqueryreply(v, ic, icmp, rev) int v; icmpinfo_t *ic; icmphdr_t *icmp; int rev; { int ictype; ictype = ic->ici_type; if (v == 4) { /* * If we matched its type on the way in, then when going out * it will still be the same type. */ if ((!rev && (icmp->icmp_type == ictype)) || (rev && (icmpreplytype4[ictype] == icmp->icmp_type))) { if (icmp->icmp_type != ICMP_ECHOREPLY) return 1; if (icmp->icmp_id == ic->ici_id) return 1; } } #ifdef USE_INET6 else if (v == 6) { if ((!rev && (icmp->icmp_type == ictype)) || (rev && (icmpreplytype6[ictype] == icmp->icmp_type))) { if (icmp->icmp_type != ICMP6_ECHO_REPLY) return 1; if (icmp->icmp_id == ic->ici_id) return 1; } } #endif return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_rule_compare */ /* Parameters: fr1(I) - first rule structure to compare */ /* fr2(I) - second rule structure to compare */ /* Returns: int - 0 == rules are the same, else mismatch */ /* */ /* Compare two rules and return 0 if they match or a number indicating */ /* which of the individual checks failed. */ /* ------------------------------------------------------------------------ */ static int ipf_rule_compare(frentry_t *fr1, frentry_t *fr2) { if (fr1->fr_cksum != fr2->fr_cksum) return 1; if (fr1->fr_size != fr2->fr_size) return 2; if (fr1->fr_dsize != fr2->fr_dsize) return 3; if (bcmp((char *)&fr1->fr_func, (char *)&fr2->fr_func, fr1->fr_size - offsetof(struct frentry, fr_func)) != 0) return 4; if (fr1->fr_data && !fr2->fr_data) return 5; if (!fr1->fr_data && fr2->fr_data) return 6; if (fr1->fr_data) { if (bcmp(fr1->fr_caddr, fr2->fr_caddr, fr1->fr_dsize)) return 7; } return 0; } /* ------------------------------------------------------------------------ */ /* Function: frrequest */ /* Returns: int - 0 == success, > 0 == errno value */ /* Parameters: unit(I) - device for which this is for */ /* req(I) - ioctl command (SIOC*) */ /* data(I) - pointer to ioctl data */ /* set(I) - 1 or 0 (filter set) */ /* makecopy(I) - flag indicating whether data points to a rule */ /* in kernel space & hence doesn't need copying. */ /* */ /* This function handles all the requests which operate on the list of */ /* filter rules. This includes addition, deletion and insertion. It is */ /* also responsible for creating groups when a "head" rule is loaded. */ /* Interface names are resolved here and other sanity checks are made on */ /* the content of the rule structure being loaded. If a rule has user */ /* defined timeouts then make sure they are created and initialised before */ /* exiting. 
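The request codes below collapse to three modes: SIOCADAFR, SIOCADIFR, SIOCINAFR and SIOCINIFR all add or insert a rule (addrem 0), SIOCRMAFR and SIOCRMIFR remove one (addrem 1), and SIOCZRLST is treated as its own mode (addrem 2).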
int
frrequest(softc, unit, req, data, set, makecopy)
    ipf_main_softc_t *softc;
    int unit;
    ioctlcmd_t req;
    int set, makecopy;
    caddr_t data;
{
    int error = 0, in, family, addrem, need_free = 0;
    frentry_t frd, *fp, *f, **fprev, **ftail;
    void *ptr, *uptr, *cptr;
    u_int *p, *pp;
    frgroup_t *fg;
    char *group;

    ptr = NULL;
    cptr = NULL;
    fg = NULL;
    fp = &frd;
    if (makecopy != 0) {
        bzero(fp, sizeof(frd));
        error = ipf_inobj(softc, data, NULL, fp, IPFOBJ_FRENTRY);
        if (error) {
            return error;
        }
        if ((fp->fr_type & FR_T_BUILTIN) != 0) {
            IPFERROR(6);
            return EINVAL;
        }
        KMALLOCS(f, frentry_t *, fp->fr_size);
        if (f == NULL) {
            IPFERROR(131);
            return ENOMEM;
        }
        bzero(f, fp->fr_size);
        error = ipf_inobjsz(softc, data, f, IPFOBJ_FRENTRY,
                            fp->fr_size);
        if (error) {
            KFREES(f, fp->fr_size);
            return error;
        }

        fp = f;
        f = NULL;
        fp->fr_next = NULL;
        fp->fr_dnext = NULL;
        fp->fr_pnext = NULL;
        fp->fr_pdnext = NULL;
        fp->fr_grp = NULL;
        fp->fr_grphead = NULL;
        fp->fr_icmpgrp = NULL;
        fp->fr_isc = (void *)-1;
        fp->fr_ptr = NULL;
        fp->fr_ref = 0;
        fp->fr_flags |= FR_COPIED;
    } else {
        fp = (frentry_t *)data;
        if ((fp->fr_type & FR_T_BUILTIN) == 0) {
            IPFERROR(7);
            return EINVAL;
        }
        fp->fr_flags &= ~FR_COPIED;
    }

    if (((fp->fr_dsize == 0) && (fp->fr_data != NULL)) ||
        ((fp->fr_dsize != 0) && (fp->fr_data == NULL))) {
        IPFERROR(8);
        error = EINVAL;
        goto donenolock;
    }

    family = fp->fr_family;
    uptr = fp->fr_data;

    if (req == (ioctlcmd_t)SIOCINAFR || req == (ioctlcmd_t)SIOCINIFR ||
        req == (ioctlcmd_t)SIOCADAFR || req == (ioctlcmd_t)SIOCADIFR)
        addrem = 0;
    else if (req == (ioctlcmd_t)SIOCRMAFR || req == (ioctlcmd_t)SIOCRMIFR)
        addrem = 1;
    else if (req == (ioctlcmd_t)SIOCZRLST)
        addrem = 2;
    else {
        IPFERROR(9);
        error = EINVAL;
        goto donenolock;
    }

    /*
     * Only filter rules for IPv4 or IPv6 are accepted.
     */
    if (family == AF_INET) {
        /*EMPTY*/;
#ifdef USE_INET6
    } else if (family == AF_INET6) {
        /*EMPTY*/;
#endif
    } else if (family != 0) {
        IPFERROR(10);
        error = EINVAL;
        goto donenolock;
    }

    /*
     * If the rule is being loaded from user space, i.e. we had to copy it
     * into kernel space, then do not trust the function pointer in the
     * rule.
     */
    if ((makecopy == 1) && (fp->fr_func != NULL)) {
        if (ipf_findfunc(fp->fr_func) == NULL) {
            IPFERROR(11);
            error = ESRCH;
            goto donenolock;
        }

        if (addrem == 0) {
            error = ipf_funcinit(softc, fp);
            if (error != 0)
                goto donenolock;
        }
    }
    if ((fp->fr_flags & FR_CALLNOW) &&
        ((fp->fr_func == NULL) || (fp->fr_func == (ipfunc_t)-1))) {
        IPFERROR(142);
        error = ESRCH;
        goto donenolock;
    }
    if (((fp->fr_flags & FR_CMDMASK) == FR_CALL) &&
        ((fp->fr_func == NULL) || (fp->fr_func == (ipfunc_t)-1))) {
        IPFERROR(143);
        error = ESRCH;
        goto donenolock;
    }

    ptr = NULL;
    cptr = NULL;

    if (FR_ISACCOUNT(fp->fr_flags))
        unit = IPL_LOGCOUNT;

    /*
     * Check that each group name in the rule has a start index that
     * is valid.
     */
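    /*
     * Group names live in the rule's fr_names string table and are
     * referenced by index, so each index must fall within fr_namelen.
     * A name of "0" is treated as meaning "no group" and is cleared.
     */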
    if (fp->fr_icmphead != -1) {
        if ((fp->fr_icmphead < 0) ||
            (fp->fr_icmphead >= fp->fr_namelen)) {
            IPFERROR(136);
            error = EINVAL;
            goto donenolock;
        }
        if (!strcmp(FR_NAME(fp, fr_icmphead), "0"))
            fp->fr_names[fp->fr_icmphead] = '\0';
    }

    if (fp->fr_grhead != -1) {
        if ((fp->fr_grhead < 0) ||
            (fp->fr_grhead >= fp->fr_namelen)) {
            IPFERROR(137);
            error = EINVAL;
            goto donenolock;
        }
        if (!strcmp(FR_NAME(fp, fr_grhead), "0"))
            fp->fr_names[fp->fr_grhead] = '\0';
    }

    if (fp->fr_group != -1) {
        if ((fp->fr_group < 0) ||
            (fp->fr_group >= fp->fr_namelen)) {
            IPFERROR(138);
            error = EINVAL;
            goto donenolock;
        }
        if ((req != (int)SIOCZRLST) && (fp->fr_group != -1)) {
            /*
             * Allow loading rules that are in groups to cause
             * them to be created if they don't already exist.
             */
            group = FR_NAME(fp, fr_group);
            if (addrem == 0) {
                fg = ipf_group_add(softc, group, NULL,
                                   fp->fr_flags, unit, set);
                fp->fr_grp = fg;
            } else {
                fg = ipf_findgroup(softc, group, unit,
                                   set, NULL);
                if (fg == NULL) {
                    IPFERROR(12);
                    error = ESRCH;
                    goto donenolock;
                }
            }

            if (fg->fg_flags == 0) {
                fg->fg_flags = fp->fr_flags & FR_INOUT;
            } else if (fg->fg_flags != (fp->fr_flags & FR_INOUT)) {
                IPFERROR(13);
                error = ESRCH;
                goto donenolock;
            }
        }
    } else {
        /*
         * If a rule is going to be part of a group then it does
         * not matter whether it is an in or out rule, but if it
         * isn't in a group, then it does...
         */
        if ((fp->fr_flags & (FR_INQUE|FR_OUTQUE)) == 0) {
            IPFERROR(14);
            error = EINVAL;
            goto donenolock;
        }
    }

    in = (fp->fr_flags & FR_INQUE) ? 0 : 1;

    /*
     * Work out which rule list this change is being applied to.
     */
    ftail = NULL;
    fprev = NULL;
    if (unit == IPL_LOGAUTH) {
        if ((fp->fr_tifs[0].fd_ptr != NULL) ||
            (fp->fr_tifs[1].fd_ptr != NULL) ||
            (fp->fr_dif.fd_ptr != NULL) ||
            (fp->fr_flags & FR_FASTROUTE)) {
            softc->ipf_interror = 145;
            error = EINVAL;
            goto donenolock;
        }
        fprev = ipf_auth_rulehead(softc);
    } else {
        if (FR_ISACCOUNT(fp->fr_flags))
            fprev = &softc->ipf_acct[in][set];
        else if ((fp->fr_flags & (FR_OUTQUE|FR_INQUE)) != 0)
            fprev = &softc->ipf_rules[in][set];
    }
    if (fprev == NULL) {
        IPFERROR(15);
        error = ESRCH;
        goto donenolock;
    }

    if (fg != NULL)
        fprev = &fg->fg_start;

    /*
     * Copy in extra data for the rule.
     */
    if (fp->fr_dsize != 0) {
        if (makecopy != 0) {
            KMALLOCS(ptr, void *, fp->fr_dsize);
            if (ptr == NULL) {
                IPFERROR(16);
                error = ENOMEM;
                goto donenolock;
            }

            /*
             * The bcopy case is for when the data is appended
             * to the rule by ipf_in_compat().
             */
            if (uptr >= (void *)fp &&
                uptr < (void *)((char *)fp + fp->fr_size)) {
                bcopy(uptr, ptr, fp->fr_dsize);
                error = 0;
            } else {
                error = COPYIN(uptr, ptr, fp->fr_dsize);
                if (error != 0) {
                    IPFERROR(17);
                    error = EFAULT;
                    goto donenolock;
                }
            }
        } else {
            ptr = uptr;
        }
        fp->fr_data = ptr;
    } else {
        fp->fr_data = NULL;
    }

    /*
     * Perform per-rule type sanity checks of their members.
     * All code after this needs to be aware that allocated memory
     * may need to be free'd before exiting.
     */
    switch (fp->fr_type & ~FR_T_BUILTIN)
    {
#if defined(IPFILTER_BPF)
    case FR_T_BPFOPC :
        if (fp->fr_dsize == 0) {
            IPFERROR(19);
            error = EINVAL;
            break;
        }
        if (!bpf_validate(ptr, fp->fr_dsize/sizeof(struct bpf_insn))) {
            IPFERROR(20);
            error = EINVAL;
            break;
        }
        break;
#endif
    case FR_T_IPF :
        /*
         * Preparation for the error case at the bottom of this
         * function.
         */
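        /*
         * Clearing the lookup pointers first means the cleanup code
         * at "donenolock" only calls ipf_lookup_deref() on pointers
         * that were actually resolved by ipf_findlookup() below.
         */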
        if (fp->fr_datype == FRI_LOOKUP)
            fp->fr_dstptr = NULL;
        if (fp->fr_satype == FRI_LOOKUP)
            fp->fr_srcptr = NULL;

        if (fp->fr_dsize != sizeof(fripf_t)) {
            IPFERROR(21);
            error = EINVAL;
            break;
        }

        /*
         * Allowing a rule with both "keep state" and "with oow" is
         * pointless because adding a state entry to the table will
         * fail with the out of window (oow) flag set.
         */
        if ((fp->fr_flags & FR_KEEPSTATE) && (fp->fr_flx & FI_OOW)) {
            IPFERROR(22);
            error = EINVAL;
            break;
        }

        switch (fp->fr_satype)
        {
        case FRI_BROADCAST :
        case FRI_DYNAMIC :
        case FRI_NETWORK :
        case FRI_NETMASKED :
        case FRI_PEERADDR :
            if (fp->fr_sifpidx < 0) {
                IPFERROR(23);
                error = EINVAL;
            }
            break;
        case FRI_LOOKUP :
            fp->fr_srcptr = ipf_findlookup(softc, unit, fp,
                                           &fp->fr_src6,
                                           &fp->fr_smsk6);
            if (fp->fr_srcfunc == NULL) {
                IPFERROR(132);
                error = ESRCH;
                break;
            }
            break;
        case FRI_NORMAL :
            break;
        default :
            IPFERROR(133);
            error = EINVAL;
            break;
        }
        if (error != 0)
            break;

        switch (fp->fr_datype)
        {
        case FRI_BROADCAST :
        case FRI_DYNAMIC :
        case FRI_NETWORK :
        case FRI_NETMASKED :
        case FRI_PEERADDR :
            if (fp->fr_difpidx < 0) {
                IPFERROR(24);
                error = EINVAL;
            }
            break;
        case FRI_LOOKUP :
            fp->fr_dstptr = ipf_findlookup(softc, unit, fp,
                                           &fp->fr_dst6,
                                           &fp->fr_dmsk6);
            if (fp->fr_dstfunc == NULL) {
                IPFERROR(134);
                error = ESRCH;
            }
            break;
        case FRI_NORMAL :
            break;
        default :
            IPFERROR(135);
            error = EINVAL;
        }
        break;

    case FR_T_NONE :
    case FR_T_CALLFUNC :
    case FR_T_COMPIPF :
        break;

    case FR_T_IPFEXPR :
        if (ipf_matcharray_verify(fp->fr_data, fp->fr_dsize) == -1) {
            IPFERROR(25);
            error = EINVAL;
        }
        break;

    default :
        IPFERROR(26);
        error = EINVAL;
        break;
    }
    if (error != 0)
        goto donenolock;

    if (fp->fr_tif.fd_name != -1) {
        if ((fp->fr_tif.fd_name < 0) ||
            (fp->fr_tif.fd_name >= fp->fr_namelen)) {
            IPFERROR(139);
            error = EINVAL;
            goto donenolock;
        }
    }

    if (fp->fr_dif.fd_name != -1) {
        if ((fp->fr_dif.fd_name < 0) ||
            (fp->fr_dif.fd_name >= fp->fr_namelen)) {
            IPFERROR(140);
            error = EINVAL;
            goto donenolock;
        }
    }

    if (fp->fr_rif.fd_name != -1) {
        if ((fp->fr_rif.fd_name < 0) ||
            (fp->fr_rif.fd_name >= fp->fr_namelen)) {
            IPFERROR(141);
            error = EINVAL;
            goto donenolock;
        }
    }

    /*
     * Lookup all the interface names that are part of the rule.
     */
    error = ipf_synclist(softc, fp, NULL);
    if (error != 0)
        goto donenolock;
    fp->fr_statecnt = 0;
    if (fp->fr_srctrack.ht_max_nodes != 0)
        ipf_rb_ht_init(&fp->fr_srctrack);

    /*
     * Look for an existing matching filter rule, but don't include the
     * next or interface pointer in the comparison (fr_next, fr_ifa).
     * This eliminates identical rules from being loaded twice. Checksum
     * the constant part of the filter rule to make comparisons quicker
     * (meaning no pointers are included).
     */
    for (fp->fr_cksum = 0, p = (u_int *)&fp->fr_func, pp = &fp->fr_cksum;
         p < pp; p++)
        fp->fr_cksum += *p;
    pp = (u_int *)(fp->fr_caddr + fp->fr_dsize);
    for (p = (u_int *)fp->fr_data; p < pp; p++)
        fp->fr_cksum += *p;

    WRITE_ENTER(&softc->ipf_mutex);

    /*
     * Now that the filter rule lists are locked, we can walk the
     * chain of them without fear.
     */
    ftail = fprev;
    for (f = *ftail; (f = *ftail) != NULL; ftail = &f->fr_next) {
        if (fp->fr_collect <= f->fr_collect) {
            ftail = fprev;
            f = NULL;
            break;
        }
        fprev = ftail;
    }

    for (; (f = *ftail) != NULL; ftail = &f->fr_next) {
        if (ipf_rule_compare(fp, f) == 0)
            break;
    }

    /*
     * If zero'ing statistics, copy current to caller and zero.
     */
    if (addrem == 2) {
        if (f == NULL) {
            IPFERROR(27);
            error = ESRCH;
        } else {
            /*
             * Copy and reduce lock because of impending copyout.
             * Well we should, but if we do then the atomicity of
             * this call and the correctness of fr_hits and
             * fr_bytes cannot be guaranteed. As it is, this code
             * only resets them to 0 if they are successfully
             * copied out into user space.
             */
            bcopy((char *)f, (char *)fp, f->fr_size);
            /* MUTEX_DOWNGRADE(&softc->ipf_mutex); */

            /*
             * When we copy this rule back out, set the data
             * pointer to be what it was in user space.
             */
            fp->fr_data = uptr;
            error = ipf_outobj(softc, data, fp, IPFOBJ_FRENTRY);

            if (error == 0) {
                if ((f->fr_dsize != 0) && (uptr != NULL))
                    error = COPYOUT(f->fr_data, uptr,
                                    f->fr_dsize);
                if (error != 0) {
                    IPFERROR(28);
                    error = EFAULT;
                }
                if (error == 0) {
                    f->fr_hits = 0;
                    f->fr_bytes = 0;
                }
            }
        }

        if (makecopy != 0) {
            if (ptr != NULL) {
                KFREES(ptr, fp->fr_dsize);
            }
            KFREES(fp, fp->fr_size);
        }
        RWLOCK_EXIT(&softc->ipf_mutex);
        return error;
    }

    if (!f) {
        /*
         * At the end of this, ftail must point to the place where the
         * new rule is to be saved/inserted/added.
         * For SIOCAD*FR, this should be the last rule in the group of
         * rules that have equal fr_collect fields.
         * For SIOCIN*FR, ...
         */
        if (req == (ioctlcmd_t)SIOCADAFR ||
            req == (ioctlcmd_t)SIOCADIFR) {

            for (ftail = fprev; (f = *ftail) != NULL; ) {
                if (f->fr_collect > fp->fr_collect)
                    break;
                ftail = &f->fr_next;
                fprev = ftail;
            }
            ftail = fprev;
            f = NULL;
            ptr = NULL;
        } else if (req == (ioctlcmd_t)SIOCINAFR ||
                   req == (ioctlcmd_t)SIOCINIFR) {
            while ((f = *fprev) != NULL) {
                if (f->fr_collect >= fp->fr_collect)
                    break;
                fprev = &f->fr_next;
            }
            ftail = fprev;
            if (fp->fr_hits != 0) {
                while (fp->fr_hits && (f = *ftail)) {
                    if (f->fr_collect != fp->fr_collect)
                        break;
                    fprev = ftail;
                    ftail = &f->fr_next;
                    fp->fr_hits--;
                }
            }
            f = NULL;
            ptr = NULL;
        }
    }

    /*
     * Request to remove a rule.
     */
    if (addrem == 1) {
        if (!f) {
            IPFERROR(29);
            error = ESRCH;
        } else {
            /*
             * Do not allow activity from user space to interfere
             * with rules not loaded that way.
             */
            if ((makecopy == 1) && !(f->fr_flags & FR_COPIED)) {
                IPFERROR(30);
                error = EPERM;
                goto done;
            }

            /*
             * Return EBUSY if the rule is being referenced by
             * something else (e.g. state information).
             */
            if (f->fr_ref > 1) {
                IPFERROR(31);
                error = EBUSY;
                goto done;
            }
#ifdef IPFILTER_SCAN
            if (f->fr_isctag != -1 &&
                (f->fr_isc != (struct ipscan *)-1))
                ipf_scan_detachfr(f);
#endif

            if (unit == IPL_LOGAUTH) {
                error = ipf_auth_precmd(softc, req, f, ftail);
                goto done;
            }

            ipf_rule_delete(softc, f, unit, set);

            need_free = makecopy;
        }
    } else {
        /*
         * Not removing, so we must be adding/inserting a rule.
         */
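        /*
         * A rule identical to one already loaded is rejected with
         * EEXIST rather than being linked in a second time.
         */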
        if (f != NULL) {
            IPFERROR(32);
            error = EEXIST;
            goto done;
        }

        if (unit == IPL_LOGAUTH) {
            error = ipf_auth_precmd(softc, req, fp, ftail);
            goto done;
        }

        MUTEX_NUKE(&fp->fr_lock);
        MUTEX_INIT(&fp->fr_lock, "filter rule lock");
        if (fp->fr_die != 0)
            ipf_rule_expire_insert(softc, fp, set);

        fp->fr_hits = 0;
        if (makecopy != 0)
            fp->fr_ref = 1;
        fp->fr_pnext = ftail;
        fp->fr_next = *ftail;
        if (fp->fr_next != NULL)
            fp->fr_next->fr_pnext = &fp->fr_next;
        *ftail = fp;
        if (addrem == 0)
            ipf_fixskip(ftail, fp, 1);

        fp->fr_icmpgrp = NULL;
        if (fp->fr_icmphead != -1) {
            group = FR_NAME(fp, fr_icmphead);
            fg = ipf_group_add(softc, group, fp, 0, unit, set);
            fp->fr_icmpgrp = fg;
        }

        fp->fr_grphead = NULL;
        if (fp->fr_grhead != -1) {
            group = FR_NAME(fp, fr_grhead);
            fg = ipf_group_add(softc, group, fp, fp->fr_flags,
                               unit, set);
            fp->fr_grphead = fg;
        }
    }
done:
    RWLOCK_EXIT(&softc->ipf_mutex);
donenolock:
    if (need_free || (error != 0)) {
        if ((fp->fr_type & ~FR_T_BUILTIN) == FR_T_IPF) {
            if ((fp->fr_satype == FRI_LOOKUP) &&
                (fp->fr_srcptr != NULL))
                ipf_lookup_deref(softc, fp->fr_srctype,
                                 fp->fr_srcptr);
            if ((fp->fr_datype == FRI_LOOKUP) &&
                (fp->fr_dstptr != NULL))
                ipf_lookup_deref(softc, fp->fr_dsttype,
                                 fp->fr_dstptr);
        }
        if (fp->fr_grp != NULL) {
            WRITE_ENTER(&softc->ipf_mutex);
            ipf_group_del(softc, fp->fr_grp, fp);
            RWLOCK_EXIT(&softc->ipf_mutex);
        }
        if ((ptr != NULL) && (makecopy != 0)) {
            KFREES(ptr, fp->fr_dsize);
        }
        KFREES(fp, fp->fr_size);
    }
    return (error);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_rule_delete */
/* Returns: Nil */
/* Parameters: softc(I) - pointer to soft context main structure */
/* f(I) - pointer to the rule being deleted */
/* unit(I) - device for which this is for */
/* set(I) - 1 or 0 (filter set) */
/* */
/* This function attempts to do what it can to delete a filter rule: remove */
/* it from any linked lists and remove any groups it is responsible for. */
/* But in the end, removing a rule can only drop the reference count - we */
/* must use that as the guide for whether or not it can be freed. */
/* ------------------------------------------------------------------------ */
static void
ipf_rule_delete(softc, f, unit, set)
    ipf_main_softc_t *softc;
    frentry_t *f;
    int unit, set;
{

    /*
     * If fr_pdnext is set, then the rule is on the expire list, so
     * remove it from there.
     */
    if (f->fr_pdnext != NULL) {
        *f->fr_pdnext = f->fr_dnext;
        if (f->fr_dnext != NULL)
            f->fr_dnext->fr_pdnext = f->fr_pdnext;
        f->fr_pdnext = NULL;
        f->fr_dnext = NULL;
    }

    ipf_fixskip(f->fr_pnext, f, -1);
    if (f->fr_pnext != NULL)
        *f->fr_pnext = f->fr_next;
    if (f->fr_next != NULL)
        f->fr_next->fr_pnext = f->fr_pnext;
    f->fr_pnext = NULL;
    f->fr_next = NULL;

    (void) ipf_derefrule(softc, &f);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_rule_expire_insert */
/* Returns: Nil */
/* Parameters: softc(I) - pointer to soft context main structure */
/* f(I) - pointer to rule to be added to expire list */
/* set(I) - 1 or 0 (filter set) */
/* */
/* If the new rule has a given expiration time, insert it into the list of */
/* expiring rules with the ones to be removed first added to the front of */
/* the list. The insertion is O(n) but it is kept sorted for quick scans at */
/* expiration interval checks. */
/* ------------------------------------------------------------------------ */
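/*
 * Editor's sketch (generic types, deliberately not compiled): both
 * ipf_rule_delete() above and ipf_rule_expire_insert() below rely on the
 * convention that an entry's "pnext" style field stores the address of
 * whatever pointer points at it, allowing O(1) unlinking with no walk to
 * find the predecessor.
 */
#if 0
struct node {
    struct node *next;
    struct node **pnext;    /* address of the pointer pointing at us */
};

static void
node_unlink(struct node *n)
{
    *n->pnext = n->next;    /* predecessor now skips this node */
    if (n->next != NULL)
        n->next->pnext = n->pnext;
    n->next = NULL;
    n->pnext = NULL;
}
#endif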
static void
ipf_rule_expire_insert(softc, f, set)
    ipf_main_softc_t *softc;
    frentry_t *f;
    int set;
{
    frentry_t *fr;

    f->fr_die = softc->ipf_ticks + IPF_TTLVAL(f->fr_die);
    for (fr = softc->ipf_rule_explist[set]; fr != NULL;
         fr = fr->fr_dnext) {
        if (f->fr_die < fr->fr_die)
            break;
        if (fr->fr_dnext == NULL) {
            /*
             * We've got to the last rule and everything
             * wanted to be expired before this new node,
             * so we have to tack it on the end...
             */
            fr->fr_dnext = f;
            f->fr_pdnext = &fr->fr_dnext;
            fr = NULL;
            break;
        }
    }

    if (softc->ipf_rule_explist[set] == NULL) {
        softc->ipf_rule_explist[set] = f;
        f->fr_pdnext = &softc->ipf_rule_explist[set];
    } else if (fr != NULL) {
        f->fr_dnext = fr;
        f->fr_pdnext = fr->fr_pdnext;
        fr->fr_pdnext = &f->fr_dnext;
    }
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_findlookup */
/* Returns: NULL = failure, else success */
/* Parameters: softc(I) - pointer to soft context main structure */
/* unit(I) - ipf device we want to find match for */
/* fr(I) - rule for which the lookup is being done */
/* addrp(I) - pointer to lookup information in address struct */
/* maskp(O) - pointer to lookup information for storage */
/* */
/* When using pools and hash tables to store addresses for matching in */
/* rules, it is necessary to resolve both the object referred to by the */
/* name or address (and return that pointer) and also provide the means by */
/* which to determine if an address belongs to that object to make the */
/* packet matching quicker. */
/* ------------------------------------------------------------------------ */
static void *
ipf_findlookup(softc, unit, fr, addrp, maskp)
    ipf_main_softc_t *softc;
    int unit;
    frentry_t *fr;
    i6addr_t *addrp, *maskp;
{
    void *ptr = NULL;

    switch (addrp->iplookupsubtype)
    {
    case 0 :
        ptr = ipf_lookup_res_num(softc, unit, addrp->iplookuptype,
                                 addrp->iplookupnum,
                                 &maskp->iplookupfunc);
        break;
    case 1 :
        if (addrp->iplookupname < 0)
            break;
        if (addrp->iplookupname >= fr->fr_namelen)
            break;
        ptr = ipf_lookup_res_name(softc, unit, addrp->iplookuptype,
                                  fr->fr_names + addrp->iplookupname,
                                  &maskp->iplookupfunc);
        break;
    default :
        break;
    }

    return ptr;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_funcinit */
/* Returns: int - 0 == success, else ESRCH: cannot resolve rule details */
/* Parameters: softc(I) - pointer to soft context main structure */
/* fr(I) - pointer to filter rule */
/* */
/* If a rule is a call rule, then check if the function it points to needs */
/* an init function to be called now the rule has been loaded. */
/* ------------------------------------------------------------------------ */
static int
ipf_funcinit(softc, fr)
    ipf_main_softc_t *softc;
    frentry_t *fr;
{
    ipfunc_resolve_t *ft;
    int err;

    IPFERROR(34);
    err = ESRCH;

    for (ft = ipf_availfuncs; ft->ipfu_addr != NULL; ft++)
        if (ft->ipfu_addr == fr->fr_func) {
            err = 0;
            if (ft->ipfu_init != NULL)
                err = (*ft->ipfu_init)(softc, fr);
            break;
        }
    return err;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_funcfini */
/* Returns: Nil */
/* Parameters: softc(I) - pointer to soft context main structure */
/* fr(I) - pointer to filter rule */
/* */
/* For a given filter rule, call the matching "fini" function if the rule */
/* is using a known function that would have resulted in the "init" being */
/* called for earlier. */
/* ------------------------------------------------------------------------ */
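/*
 * Illustrative only (deliberately not compiled): the rough shape of an
 * ipf_availfuncs-style entry tying a callable filter function to optional
 * init/fini hooks. The entry name, the helper functions and the use of
 * designated initialisers here are all assumptions made for the example;
 * the real table is defined elsewhere in ipfilter.
 */
#if 0
static ipfunc_resolve_t example_entry = {
    .ipfu_name = "example",
    .ipfu_addr = example_func,  /* called when the rule matches */
    .ipfu_init = example_init,  /* run by ipf_funcinit() on load */
    .ipfu_fini = example_fini   /* run by ipf_funcfini() on unload */
};
#endif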
static void
ipf_funcfini(softc, fr)
    ipf_main_softc_t *softc;
    frentry_t *fr;
{
    ipfunc_resolve_t *ft;

    for (ft = ipf_availfuncs; ft->ipfu_addr != NULL; ft++)
        if (ft->ipfu_addr == fr->fr_func) {
            if (ft->ipfu_fini != NULL)
                (void) (*ft->ipfu_fini)(softc, fr);
            break;
        }
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_findfunc */
/* Returns: ipfunc_t - pointer to function if found, else NULL */
/* Parameters: funcptr(I) - function pointer to lookup */
/* */
/* Look for a function in the table of known functions. */
/* ------------------------------------------------------------------------ */
static ipfunc_t
ipf_findfunc(funcptr)
    ipfunc_t funcptr;
{
    ipfunc_resolve_t *ft;

    for (ft = ipf_availfuncs; ft->ipfu_addr != NULL; ft++)
        if (ft->ipfu_addr == funcptr)
            return funcptr;
    return NULL;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_resolvefunc */
/* Returns: int - 0 == success, else error */
/* Parameters: data(IO) - ioctl data pointer to ipfunc_resolve_t struct */
/* */
/* Copy in an ipfunc_resolve_t structure and then fill in the missing */
/* field. This will either be the function name (if the pointer is set) or */
/* the function pointer if the name is set. When found, fill in the other */
/* one so that the entire, complete, structure can be copied back to user */
/* space. */
/* ------------------------------------------------------------------------ */
int
ipf_resolvefunc(softc, data)
    ipf_main_softc_t *softc;
    void *data;
{
    ipfunc_resolve_t res, *ft;
    int error;

    error = BCOPYIN(data, &res, sizeof(res));
    if (error != 0) {
        IPFERROR(123);
        return EFAULT;
    }

    if (res.ipfu_addr == NULL && res.ipfu_name[0] != '\0') {
        for (ft = ipf_availfuncs; ft->ipfu_addr != NULL; ft++)
            if (strncmp(res.ipfu_name, ft->ipfu_name,
                        sizeof(res.ipfu_name)) == 0) {
                res.ipfu_addr = ft->ipfu_addr;
                res.ipfu_init = ft->ipfu_init;
                if (COPYOUT(&res, data, sizeof(res)) != 0) {
                    IPFERROR(35);
                    return EFAULT;
                }
                return 0;
            }
    }
    if (res.ipfu_addr != NULL && res.ipfu_name[0] == '\0') {
        for (ft = ipf_availfuncs; ft->ipfu_addr != NULL; ft++)
            if (ft->ipfu_addr == res.ipfu_addr) {
                (void) strncpy(res.ipfu_name, ft->ipfu_name,
                               sizeof(res.ipfu_name));
                res.ipfu_init = ft->ipfu_init;
                if (COPYOUT(&res, data, sizeof(res)) != 0) {
                    IPFERROR(36);
                    return EFAULT;
                }
                return 0;
            }
    }
    IPFERROR(37);
    return ESRCH;
}


#if !defined(_KERNEL) || (!defined(__NetBSD__) && !defined(__OpenBSD__) && \
     !defined(__FreeBSD__)) || \
    FREEBSD_LT_REV(501000) || NETBSD_LT_REV(105000000) || \
    OPENBSD_LT_REV(200006)
/*
 * From: NetBSD
 * ppsratecheck(): packets (or events) per second limitation.
 */
int
ppsratecheck(lasttime, curpps, maxpps)
    struct timeval *lasttime;
    int *curpps;
    int maxpps;     /* maximum pps allowed */
{
    struct timeval tv, delta;
    int rv;

    GETKTIME(&tv);

    delta.tv_sec = tv.tv_sec - lasttime->tv_sec;
    delta.tv_usec = tv.tv_usec - lasttime->tv_usec;
    if (delta.tv_usec < 0) {
        delta.tv_sec--;
        delta.tv_usec += 1000000;
    }

    /*
     * The check for 0,0 is so that the message will be seen at least
     * once. If more than one second has passed since the last update
     * of lasttime, reset the counter.
     *
     * We do increment *curpps even in the *curpps < maxpps case, as
     * some may try to use *curpps for stat purposes as well.
     */
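    /*
     * The return value is 1 if the caller may proceed (the limit has
     * not yet been hit this second, or maxpps is negative meaning "no
     * limit") and 0 if the event should be suppressed.
     */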
    if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) ||
        delta.tv_sec >= 1) {
        *lasttime = tv;
        *curpps = 0;
        rv = 1;
    } else if (maxpps < 0)
        rv = 1;
    else if (*curpps < maxpps)
        rv = 1;
    else
        rv = 0;
    *curpps = *curpps + 1;

    return (rv);
}
#endif


/* ------------------------------------------------------------------------ */
/* Function: ipf_derefrule */
/* Returns: int - 0 == rule freed up, else rule not freed */
/* Parameters: softc(I) - pointer to soft context main structure */
/* frp(I) - pointer to filter rule pointer */
/* */
/* Decrement the reference counter to a rule by one. If it reaches zero, */
/* free it and any associated storage space being used by it. */
/* ------------------------------------------------------------------------ */
int
ipf_derefrule(softc, frp)
    ipf_main_softc_t *softc;
    frentry_t **frp;
{
    frentry_t *fr;
    frdest_t *fdp;

    fr = *frp;
    *frp = NULL;

    MUTEX_ENTER(&fr->fr_lock);
    fr->fr_ref--;
    if (fr->fr_ref == 0) {
        MUTEX_EXIT(&fr->fr_lock);
        MUTEX_DESTROY(&fr->fr_lock);

        ipf_funcfini(softc, fr);

        fdp = &fr->fr_tif;
        if (fdp->fd_type == FRD_DSTLIST)
            ipf_lookup_deref(softc, IPLT_DSTLIST, fdp->fd_ptr);

        fdp = &fr->fr_rif;
        if (fdp->fd_type == FRD_DSTLIST)
            ipf_lookup_deref(softc, IPLT_DSTLIST, fdp->fd_ptr);

        fdp = &fr->fr_dif;
        if (fdp->fd_type == FRD_DSTLIST)
            ipf_lookup_deref(softc, IPLT_DSTLIST, fdp->fd_ptr);

        if ((fr->fr_type & ~FR_T_BUILTIN) == FR_T_IPF &&
            fr->fr_satype == FRI_LOOKUP)
            ipf_lookup_deref(softc, fr->fr_srctype, fr->fr_srcptr);
        if ((fr->fr_type & ~FR_T_BUILTIN) == FR_T_IPF &&
            fr->fr_datype == FRI_LOOKUP)
            ipf_lookup_deref(softc, fr->fr_dsttype, fr->fr_dstptr);

        if (fr->fr_grp != NULL)
            ipf_group_del(softc, fr->fr_grp, fr);

        if (fr->fr_grphead != NULL)
            ipf_group_del(softc, fr->fr_grphead, fr);

        if (fr->fr_icmpgrp != NULL)
            ipf_group_del(softc, fr->fr_icmpgrp, fr);

        if ((fr->fr_flags & FR_COPIED) != 0) {
            if (fr->fr_dsize) {
                KFREES(fr->fr_data, fr->fr_dsize);
            }
            KFREES(fr, fr->fr_size);
            return 0;
        }
        return 1;
    } else {
        MUTEX_EXIT(&fr->fr_lock);
    }
    return -1;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_grpmapinit */
/* Returns: int - 0 == success, else ESRCH because table entry not found */
/* Parameters: fr(I) - pointer to rule to find hash table for */
/* */
/* Looks for group hash table fr_arg and stores a pointer to it in fr_ptr. */
/* fr_ptr is later used by ipf_srcgrpmap and ipf_dstgrpmap. */
/* ------------------------------------------------------------------------ */
static int
ipf_grpmapinit(softc, fr)
    ipf_main_softc_t *softc;
    frentry_t *fr;
{
    char name[FR_GROUPLEN];
    iphtable_t *iph;

#if defined(SNPRINTF) && defined(_KERNEL)
    SNPRINTF(name, sizeof(name), "%d", fr->fr_arg);
#else
    (void) sprintf(name, "%d", fr->fr_arg);
#endif
    iph = ipf_lookup_find_htable(softc, IPL_LOGIPF, name);
    if (iph == NULL) {
        IPFERROR(38);
        return ESRCH;
    }
    if ((iph->iph_flags & FR_INOUT) != (fr->fr_flags & FR_INOUT)) {
        IPFERROR(39);
        return ESRCH;
    }
    iph->iph_ref++;
    fr->fr_ptr = iph;
    return 0;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_grpmapfini */
/* Returns: int - 0 == success, else ESRCH because table entry not found */
/* Parameters: softc(I) - pointer to soft context main structure */
/* fr(I) - pointer to rule to release hash table for */
/* */
/* For rules that have had ipf_grpmapinit called, ipf_lookup_deref needs to */
/* be called to undo what ipf_grpmapinit caused to be done. */
/* ------------------------------------------------------------------------ */
static int
ipf_grpmapfini(softc, fr)
    ipf_main_softc_t *softc;
    frentry_t *fr;
{
    iphtable_t *iph;

    iph = fr->fr_ptr;
    if (iph != NULL)
        ipf_lookup_deref(softc, IPLT_HASH, iph);
    return 0;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_srcgrpmap */
/* Returns: frentry_t * - pointer to "new last matching" rule or NULL */
/* Parameters: fin(I) - pointer to packet information */
/* passp(IO) - pointer to current/new filter decision (unused) */
/* */
/* Look for a rule group head in a hash table, using the source address as */
/* the key, and descend into that group and continue matching rules against */
/* the packet. */
/* ------------------------------------------------------------------------ */
frentry_t *
ipf_srcgrpmap(fin, passp)
    fr_info_t *fin;
    u_32_t *passp;
{
    frgroup_t *fg;
    void *rval;

    rval = ipf_iphmfindgroup(fin->fin_main_soft, fin->fin_fr->fr_ptr,
                             &fin->fin_src);
    if (rval == NULL)
        return NULL;

    fg = rval;
    fin->fin_fr = fg->fg_start;
    (void) ipf_scanlist(fin, *passp);
    return fin->fin_fr;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_dstgrpmap */
/* Returns: frentry_t * - pointer to "new last matching" rule or NULL */
/* Parameters: fin(I) - pointer to packet information */
/* passp(IO) - pointer to current/new filter decision (unused) */
/* */
/* Look for a rule group head in a hash table, using the destination */
/* address as the key, and descend into that group and continue matching */
/* rules against the packet. */
/* ------------------------------------------------------------------------ */
frentry_t *
ipf_dstgrpmap(fin, passp)
    fr_info_t *fin;
    u_32_t *passp;
{
    frgroup_t *fg;
    void *rval;

    rval = ipf_iphmfindgroup(fin->fin_main_soft, fin->fin_fr->fr_ptr,
                             &fin->fin_dst);
    if (rval == NULL)
        return NULL;

    fg = rval;
    fin->fin_fr = fg->fg_start;
    (void) ipf_scanlist(fin, *passp);
    return fin->fin_fr;
}


/*
 * Queue functions
 * ===============
 * These functions manage objects on queues for efficient timeouts. There
 * are a number of system defined queues as well as user defined timeouts.
 * It is expected that a lock is held in the domain in which the queue
 * belongs (i.e. either state or NAT) when calling any of these functions;
 * that lock prevents ipf_freetimeoutqueue() from being called at the same
 * time as any other.
 */


/* ------------------------------------------------------------------------ */
/* Function: ipf_addtimeoutqueue */
/* Returns: ipftq_t * - NULL if malloc fails, else pointer to */
/* timeout queue with given interval. */
/* Parameters: parent(I) - pointer to pointer to parent node of this list */
/* of interface queues. */
/* seconds(I) - timeout value in seconds for this queue. */
/* */
/* This routine first looks for a timeout queue that matches the interval */
/* being requested. If it finds one, increments the reference counter and */
/* returns a pointer to it. If none are found, it allocates a new one and */
/* inserts it at the top of the list. */
/* */
/* Locking. */
/* It is assumed that the caller of this function has an appropriate lock */
/* held (exclusively) in the domain that encompasses 'parent'. */
/* ------------------------------------------------------------------------ */
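/*
 * Usage sketch (hypothetical, deliberately not compiled): acquire a two
 * minute queue while holding the appropriate domain lock, then later drop
 * the reference under ifq_lock, as the documentation for
 * ipf_deletetimeoutqueue() below requires.
 */
#if 0
    ipftq_t *tq;

    tq = ipf_addtimeoutqueue(softc, &head, 120);    /* 120 second TTL */
    if (tq != NULL) {
        /* ... attach entries with ipf_queueappend() ... */
        MUTEX_ENTER(&tq->ifq_lock);
        (void) ipf_deletetimeoutqueue(tq);          /* drop our reference */
        MUTEX_EXIT(&tq->ifq_lock);
    }
#endif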
ipftq_t *
ipf_addtimeoutqueue(softc, parent, seconds)
    ipf_main_softc_t *softc;
    ipftq_t **parent;
    u_int seconds;
{
    ipftq_t *ifq;
    u_int period;

    period = seconds * IPF_HZ_DIVIDE;

    MUTEX_ENTER(&softc->ipf_timeoutlock);
    for (ifq = *parent; ifq != NULL; ifq = ifq->ifq_next) {
        if (ifq->ifq_ttl == period) {
            /*
             * Reset the delete flag, if set, so the structure
             * gets reused rather than freed and reallocated.
             */
            MUTEX_ENTER(&ifq->ifq_lock);
            ifq->ifq_flags &= ~IFQF_DELETE;
            ifq->ifq_ref++;
            MUTEX_EXIT(&ifq->ifq_lock);
            MUTEX_EXIT(&softc->ipf_timeoutlock);

            return ifq;
        }
    }

    KMALLOC(ifq, ipftq_t *);
    if (ifq != NULL) {
        MUTEX_NUKE(&ifq->ifq_lock);
        IPFTQ_INIT(ifq, period, "ipftq mutex");
        ifq->ifq_next = *parent;
        ifq->ifq_pnext = parent;
        ifq->ifq_flags = IFQF_USER;
        ifq->ifq_ref++;
        *parent = ifq;
        softc->ipf_userifqs++;
    }
    MUTEX_EXIT(&softc->ipf_timeoutlock);
    return ifq;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_deletetimeoutqueue */
/* Returns: int - new reference count value of the timeout queue */
/* Parameters: ifq(I) - timeout queue which is losing a reference. */
/* Locks: ifq->ifq_lock */
/* */
/* This routine must be called when we're discarding a pointer to a timeout */
/* queue object, taking care of the reference counter. */
/* */
/* Now that this just sets a DELETE flag, it requires the expire code to */
/* check the list of user defined timeout queues and call the free function */
/* below to stop memory leaking. It is done this way because the locking */
/* may not be sufficient to safely do a free when this function is called. */
/* ------------------------------------------------------------------------ */
int
ipf_deletetimeoutqueue(ifq)
    ipftq_t *ifq;
{

    ifq->ifq_ref--;
    if ((ifq->ifq_ref == 0) && ((ifq->ifq_flags & IFQF_USER) != 0)) {
        ifq->ifq_flags |= IFQF_DELETE;
    }

    return ifq->ifq_ref;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_freetimeoutqueue */
/* Parameters: ifq(I) - timeout queue which is losing a reference. */
/* Returns: Nil */
/* */
/* Locking: */
/* It is assumed that the caller of this function has an appropriate lock */
/* held (exclusively) in the domain that encompasses the caller's "domain". */
/* The ifq_lock for this structure should not be held. */
/* */
/* Remove a user defined timeout queue from the list of queues it is in and */
/* tidy up after this is done. */
/* ------------------------------------------------------------------------ */
void
ipf_freetimeoutqueue(softc, ifq)
    ipf_main_softc_t *softc;
    ipftq_t *ifq;
{

    if (((ifq->ifq_flags & IFQF_DELETE) == 0) || (ifq->ifq_ref != 0) ||
        ((ifq->ifq_flags & IFQF_USER) == 0)) {
        printf("ipf_freetimeoutqueue(%lx) flags 0x%x ttl %d ref %d\n",
               (u_long)ifq, ifq->ifq_flags, ifq->ifq_ttl,
               ifq->ifq_ref);
        return;
    }

    /*
     * Remove from its position in the list.
     */
    *ifq->ifq_pnext = ifq->ifq_next;
    if (ifq->ifq_next != NULL)
        ifq->ifq_next->ifq_pnext = ifq->ifq_pnext;
    ifq->ifq_next = NULL;
    ifq->ifq_pnext = NULL;

    MUTEX_DESTROY(&ifq->ifq_lock);
    ATOMIC_DEC(softc->ipf_userifqs);
    KFREE(ifq);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_deletequeueentry */
/* Returns: Nil */
/* Parameters: tqe(I) - timeout queue entry to delete */
/* */
/* Remove a tail queue entry from its queue and make it an orphan. */
/* ipf_deletetimeoutqueue is called to make sure the reference count on the */
/* queue is correct. We can't, however, call ipf_freetimeoutqueue because */
/* the correct lock(s) may not be held that would make it safe to do so. */
/* ------------------------------------------------------------------------ */
void
ipf_deletequeueentry(tqe)
    ipftqent_t *tqe;
{
    ipftq_t *ifq;

    ifq = tqe->tqe_ifq;

    MUTEX_ENTER(&ifq->ifq_lock);

    if (tqe->tqe_pnext != NULL) {
        *tqe->tqe_pnext = tqe->tqe_next;
        if (tqe->tqe_next != NULL)
            tqe->tqe_next->tqe_pnext = tqe->tqe_pnext;
        else    /* we must be the tail anyway */
            ifq->ifq_tail = tqe->tqe_pnext;

        tqe->tqe_pnext = NULL;
        tqe->tqe_ifq = NULL;
    }

    (void) ipf_deletetimeoutqueue(ifq);
    ASSERT(ifq->ifq_ref > 0);

    MUTEX_EXIT(&ifq->ifq_lock);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_queuefront */
/* Returns: Nil */
/* Parameters: tqe(I) - pointer to timeout queue entry */
/* */
/* Move a queue entry to the front of the queue, if it isn't already there. */
/* ------------------------------------------------------------------------ */
void
ipf_queuefront(tqe)
    ipftqent_t *tqe;
{
    ipftq_t *ifq;

    ifq = tqe->tqe_ifq;
    if (ifq == NULL)
        return;

    MUTEX_ENTER(&ifq->ifq_lock);
    if (ifq->ifq_head != tqe) {
        *tqe->tqe_pnext = tqe->tqe_next;
        if (tqe->tqe_next)
            tqe->tqe_next->tqe_pnext = tqe->tqe_pnext;
        else
            ifq->ifq_tail = tqe->tqe_pnext;

        tqe->tqe_next = ifq->ifq_head;
        ifq->ifq_head->tqe_pnext = &tqe->tqe_next;
        ifq->ifq_head = tqe;
        tqe->tqe_pnext = &ifq->ifq_head;
    }
    MUTEX_EXIT(&ifq->ifq_lock);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_queueback */
/* Returns: Nil */
/* Parameters: ticks(I) - ipf tick time to use with this call */
/* tqe(I) - pointer to timeout queue entry */
/* */
/* Move a queue entry to the back of the queue, if it isn't already there. */
/* We use ticks to calculate the expiration and mark for when we last */
/* touched the structure. */
/* ------------------------------------------------------------------------ */
void
ipf_queueback(ticks, tqe)
    u_long ticks;
    ipftqent_t *tqe;
{
    ipftq_t *ifq;

    ifq = tqe->tqe_ifq;
    if (ifq == NULL)
        return;
    tqe->tqe_die = ticks + ifq->ifq_ttl;
    tqe->tqe_touched = ticks;

    MUTEX_ENTER(&ifq->ifq_lock);
    if (tqe->tqe_next != NULL) {        /* at the end already ? */
        /*
         * Remove from list
         */
        *tqe->tqe_pnext = tqe->tqe_next;
        tqe->tqe_next->tqe_pnext = tqe->tqe_pnext;

        /*
         * Make it the last entry.
         */
        tqe->tqe_next = NULL;
        tqe->tqe_pnext = ifq->ifq_tail;
        *ifq->ifq_tail = tqe;
        ifq->ifq_tail = &tqe->tqe_next;
    }
    MUTEX_EXIT(&ifq->ifq_lock);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_queueappend */
/* Returns: Nil */
/* Parameters: ticks(I) - ipf tick time to use with this call */
/* tqe(I) - pointer to timeout queue entry */
/* ifq(I) - pointer to timeout queue */
/* parent(I) - owning object pointer */
/* */
/* Add a new item to this queue and put it on the very end. */
/* We use ticks to calculate the expiration and mark for when we last */
/* touched the structure. */
/* ------------------------------------------------------------------------ */
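/*
 * Editor's sketch (generic types, deliberately not compiled): the ifq_tail
 * convention used below keeps a pointer to the final "next" field of the
 * queue, so an append patches that single link and never walks the list.
 * An empty queue starts with tail = &head.
 */
#if 0
struct ent { struct ent *next; };
struct q { struct ent *head; struct ent **tail; };

static void
q_append(struct q *q, struct ent *e)
{
    e->next = NULL;
    *q->tail = e;           /* patch the old tail link */
    q->tail = &e->next;     /* our next field is the new tail link */
}
#endif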
void
ipf_queueappend(ticks, tqe, ifq, parent)
    u_long ticks;
    ipftqent_t *tqe;
    ipftq_t *ifq;
    void *parent;
{

    MUTEX_ENTER(&ifq->ifq_lock);
    tqe->tqe_parent = parent;
    tqe->tqe_pnext = ifq->ifq_tail;
    *ifq->ifq_tail = tqe;
    ifq->ifq_tail = &tqe->tqe_next;
    tqe->tqe_next = NULL;
    tqe->tqe_ifq = ifq;
    tqe->tqe_die = ticks + ifq->ifq_ttl;
    tqe->tqe_touched = ticks;
    ifq->ifq_ref++;
    MUTEX_EXIT(&ifq->ifq_lock);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_movequeue */
/* Returns: Nil */
/* Parameters: tq(I) - pointer to timeout queue information */
/* oifp(I) - old timeout queue entry was on */
/* nifp(I) - new timeout queue to put entry on */
/* */
/* Move a queue entry from one timeout queue to another timeout queue. */
/* If it notices that the current entry is already last and does not need */
/* to change queues, it simply returns. */
/* ------------------------------------------------------------------------ */
void
ipf_movequeue(ticks, tqe, oifq, nifq)
    u_long ticks;
    ipftqent_t *tqe;
    ipftq_t *oifq, *nifq;
{

    /*
     * If the queue hasn't changed and we last touched this entry at the
     * same ipf time, then we're not going to achieve anything by either
     * changing the ttl or moving it on the queue.
     */
    if (oifq == nifq && tqe->tqe_touched == ticks)
        return;

    /*
     * For any of this to be outside the lock, there is a risk that two
     * packets entering simultaneously, with one changing to a different
     * queue and one not, could end up with things in a bizarre state.
     */
    MUTEX_ENTER(&oifq->ifq_lock);

    tqe->tqe_touched = ticks;
    tqe->tqe_die = ticks + nifq->ifq_ttl;
    /*
     * Is the operation here going to be a no-op ?
     */
    if (oifq == nifq) {
        if ((tqe->tqe_next == NULL) ||
            (tqe->tqe_next->tqe_die == tqe->tqe_die)) {
            MUTEX_EXIT(&oifq->ifq_lock);
            return;
        }
    }

    /*
     * Remove from the old queue
     */
    *tqe->tqe_pnext = tqe->tqe_next;
    if (tqe->tqe_next)
        tqe->tqe_next->tqe_pnext = tqe->tqe_pnext;
    else
        oifq->ifq_tail = tqe->tqe_pnext;
    tqe->tqe_next = NULL;

    /*
     * If we're moving from one queue to another, release the
     * lock on the old queue and get a lock on the new queue.
     * For user defined queues, if we're moving off it, call
     * delete in case it can now be freed.
     */
    if (oifq != nifq) {
        tqe->tqe_ifq = NULL;

        (void) ipf_deletetimeoutqueue(oifq);

        MUTEX_EXIT(&oifq->ifq_lock);

        MUTEX_ENTER(&nifq->ifq_lock);

        tqe->tqe_ifq = nifq;
        nifq->ifq_ref++;
    }

    /*
     * Add to the bottom of the new queue
     */
    tqe->tqe_pnext = nifq->ifq_tail;
    *nifq->ifq_tail = tqe;
    nifq->ifq_tail = &tqe->tqe_next;
    MUTEX_EXIT(&nifq->ifq_lock);
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_updateipid */
/* Returns: int - 0 == success, -1 == error (packet should be dropped) */
/* Parameters: fin(I) - pointer to packet information */
/* */
/* When we are doing NAT, change the IP id of every packet to represent a */
/* single sequence of packets coming from the host, hiding any host */
/* specific sequencing that might otherwise be revealed. If the packet is */
/* a fragment, then store the 'new' IPid in the fragment cache and look up */
/* the fragment cache for non-leading fragments. If a non-leading fragment */
/* has no match in the cache, return an error. */
/* ------------------------------------------------------------------------ */
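/*
 * Editor's sketch of the adjustment performed below (deliberately not
 * compiled): when a 16 bit header field changes from "old" to "new", the
 * one's complement checksum can be patched incrementally, in the style of
 * RFC 1624, instead of being recomputed over the whole header. Values
 * here are in host byte order.
 */
#if 0
static u_short
cksum_adjust(u_short cksum, u_short old, u_short new)
{
    u_32_t sum;

    sum = (~cksum & 0xffff) + (~old & 0xffff) + new;
    sum = (sum >> 16) + (sum & 0xffff);     /* fold the carries back in */
    sum = (sum >> 16) + (sum & 0xffff);
    return ~(u_short)sum & 0xffff;
}
#endif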
static int
ipf_updateipid(fin)
    fr_info_t *fin;
{
    u_short id, ido, sums;
    u_32_t sumd, sum;
    ip_t *ip;

    ip = fin->fin_ip;
    ido = ntohs(ip->ip_id);
    if (fin->fin_off != 0) {
        sum = ipf_frag_ipidknown(fin);
        if (sum == 0xffffffff)
            return -1;
        sum &= 0xffff;
        id = (u_short)sum;
        ip->ip_id = htons(id);
    } else {
        ip_fillid(ip);
        id = ntohs(ip->ip_id);
        if ((fin->fin_flx & FI_FRAG) != 0)
            (void) ipf_frag_ipidnew(fin, (u_32_t)id);
    }

    if (id == ido)
        return 0;

    CALC_SUMD(ido, id, sumd);   /* DESTRUCTIVE MACRO! id,ido change */
    sum = (~ntohs(ip->ip_sum)) & 0xffff;
    sum += sumd;
    sum = (sum >> 16) + (sum & 0xffff);
    sum = (sum >> 16) + (sum & 0xffff);
    sums = ~(u_short)sum;
    ip->ip_sum = htons(sums);
    return 0;
}


#ifdef NEED_FRGETIFNAME
/* ------------------------------------------------------------------------ */
/* Function: ipf_getifname */
/* Returns: char * - pointer to interface name */
/* Parameters: ifp(I) - pointer to network interface */
/* buffer(O) - pointer to where to store interface name */
/* */
/* Constructs an interface name in the buffer passed. The buffer passed is */
/* expected to be at least LIFNAMSIZ in bytes big. If buffer is passed in */
/* as a NULL pointer then return a pointer to a static array. */
/* ------------------------------------------------------------------------ */
char *
ipf_getifname(ifp, buffer)
    struct ifnet *ifp;
    char *buffer;
{
    static char namebuf[LIFNAMSIZ];
# if defined(MENTAT) || defined(__FreeBSD__) || defined(__osf__) || \
     defined(__sgi) || defined(linux) || defined(_AIX51) || \
     (defined(sun) && !defined(__SVR4) && !defined(__svr4__))
    int unit, space;
    char temp[20];
    char *s;
# endif

    if (buffer == NULL)
        buffer = namebuf;
    (void) strncpy(buffer, ifp->if_name, LIFNAMSIZ);
    buffer[LIFNAMSIZ - 1] = '\0';
# if defined(MENTAT) || defined(__FreeBSD__) || defined(__osf__) || \
     defined(__sgi) || defined(_AIX51) || \
     (defined(sun) && !defined(__SVR4) && !defined(__svr4__))
    for (s = buffer; *s; s++)
        ;
    unit = ifp->if_unit;
    space = LIFNAMSIZ - (s - buffer);
    if ((space > 0) && (unit >= 0)) {
#  if defined(SNPRINTF) && defined(_KERNEL)
        SNPRINTF(temp, sizeof(temp), "%d", unit);
#  else
        (void) sprintf(temp, "%d", unit);
#  endif
        (void) strncpy(s, temp, space);
    }
# endif
    return buffer;
}
#endif


/* ------------------------------------------------------------------------ */
/* Function: ipf_ioctlswitch */
/* Returns: int - -1 continue processing, else ioctl return value */
/* Parameters: unit(I) - device unit opened */
/* data(I) - pointer to ioctl data */
/* cmd(I) - ioctl command */
/* mode(I) - mode value */
/* uid(I) - uid making the ioctl call */
/* ctx(I) - pointer to context data */
/* */
/* Based on the value of unit, call the appropriate ioctl handler or return */
/* EIO if ipfilter is not running. Also checks if write perms are req'd */
/* for the device in order to execute the ioctl. A special case is made */
/* for SIOCIPFINTERROR so that the same code isn't required in every */
/* handler. The context data pointer is passed through as this is used as */
/* the key for locating a matching token for continued access for walking */
/* lists, etc. */
/* ------------------------------------------------------------------------ */
int
ipf_ioctlswitch(softc, unit, data, cmd, mode, uid, ctx)
    ipf_main_softc_t *softc;
    int unit, mode, uid;
    ioctlcmd_t cmd;
    void *data, *ctx;
{
    int error = 0;

    switch (cmd)
    {
    case SIOCIPFINTERROR :
        error = BCOPYOUT(&softc->ipf_interror, data,
                         sizeof(softc->ipf_interror));
        if (error != 0) {
            IPFERROR(40);
            error = EFAULT;
        }
        return error;
    default :
        break;
    }

    switch (unit)
    {
    case IPL_LOGIPF :
        error = ipf_ipf_ioctl(softc, data, cmd, mode, uid, ctx);
        break;
    case IPL_LOGNAT :
        if (softc->ipf_running > 0) {
            error = ipf_nat_ioctl(softc, data, cmd, mode,
                                  uid, ctx);
        } else {
            IPFERROR(42);
            error = EIO;
        }
        break;
    case IPL_LOGSTATE :
        if (softc->ipf_running > 0) {
            error = ipf_state_ioctl(softc, data, cmd, mode,
                                    uid, ctx);
        } else {
            IPFERROR(43);
            error = EIO;
        }
        break;
    case IPL_LOGAUTH :
        if (softc->ipf_running > 0) {
            error = ipf_auth_ioctl(softc, data, cmd, mode,
                                   uid, ctx);
        } else {
            IPFERROR(44);
            error = EIO;
        }
        break;
    case IPL_LOGSYNC :
        if (softc->ipf_running > 0) {
            error = ipf_sync_ioctl(softc, data, cmd, mode,
                                   uid, ctx);
        } else {
            error = EIO;
            IPFERROR(45);
        }
        break;
    case IPL_LOGSCAN :
#ifdef IPFILTER_SCAN
        if (softc->ipf_running > 0)
            error = ipf_scan_ioctl(softc, data, cmd, mode,
                                   uid, ctx);
        else
#endif
        {
            error = EIO;
            IPFERROR(46);
        }
        break;
    case IPL_LOGLOOKUP :
        if (softc->ipf_running > 0) {
            error = ipf_lookup_ioctl(softc, data, cmd, mode,
                                     uid, ctx);
        } else {
            error = EIO;
            IPFERROR(47);
        }
        break;
    default :
        IPFERROR(48);
        error = EIO;
        break;
    }

    return error;
}


/*
 * This array defines the expected size of objects coming into the kernel
 * for the various recognised object types. The first column is flags (see
 * below), 2nd column is current size, 3rd column is the version number of
 * when the current size became current.
 * Flags:
 * 1 = minimum size, not absolute size
 */
static int ipf_objbytes[IPFOBJ_COUNT][3] = {
    { 1,    sizeof(struct frentry),         5010000 },  /* 0 */
    { 1,    sizeof(struct friostat),        5010000 },
    { 0,    sizeof(struct fr_info),         5010000 },
    { 0,    sizeof(struct ipf_authstat),    4010100 },
    { 0,    sizeof(struct ipfrstat),        5010000 },
    { 1,    sizeof(struct ipnat),           5010000 },  /* 5 */
    { 0,    sizeof(struct natstat),         5010000 },
    { 0,    sizeof(struct ipstate_save),    5010000 },
    { 1,    sizeof(struct nat_save),        5010000 },
    { 0,    sizeof(struct natlookup),       5010000 },
    { 1,    sizeof(struct ipstate),         5010000 },  /* 10 */
    { 0,    sizeof(struct ips_stat),        5010000 },
    { 0,    sizeof(struct frauth),          5010000 },
    { 0,    sizeof(struct ipftune),         4010100 },
    { 0,    sizeof(struct nat),             5010000 },
    { 0,    sizeof(struct ipfruleiter),     4011400 },  /* 15 */
    { 0,    sizeof(struct ipfgeniter),      4011400 },
    { 0,    sizeof(struct ipftable),        4011400 },
    { 0,    sizeof(struct ipflookupiter),   4011400 },
    { 0,    sizeof(struct ipftq) * IPF_TCP_NSTATES },
    { 1,    0,                              0       },  /* IPFEXPR */
    { 0,    0,                              0       },  /* PROXYCTL */
    { 0,    sizeof (struct fripf),          5010000 }
};


/* ------------------------------------------------------------------------ */
/* Function: ipf_inobj */
/* Returns: int - 0 = success, else failure */
/* Parameters: softc(I) - soft context pointer to work with */
/* data(I) - pointer to ioctl data */
/* objp(O) - where to store ipfobj structure */
/* ptr(I) - pointer to data to copy out */
/* type(I) - type of structure being moved */
/* */
/* Copy in the contents of what the ipfobj_t points to. In the future, we */
/* can add things to check for version numbers, sizes, etc, to make it */
/* backward compatible at the ABI for user land. */
/* If objp is not NULL then we assume that the caller wants to see what is */
/* in the ipfobj_t structure being copied in. As an example, this can tell */
/* the caller what version of ipfilter the ioctl program was written to. */
/* ------------------------------------------------------------------------ */
int
ipf_inobj(softc, data, objp, ptr, type)
    ipf_main_softc_t *softc;
    void *data;
    ipfobj_t *objp;
    void *ptr;
    int type;
{
    ipfobj_t obj;
    int error;
    int size;

    if ((type < 0) || (type >= IPFOBJ_COUNT)) {
        IPFERROR(49);
        return EINVAL;
    }

    if (objp == NULL)
        objp = &obj;
    error = BCOPYIN(data, objp, sizeof(*objp));
    if (error != 0) {
        IPFERROR(124);
        return EFAULT;
    }

    if (objp->ipfo_type != type) {
        IPFERROR(50);
        return EINVAL;
    }

    if (objp->ipfo_rev >= ipf_objbytes[type][2]) {
        if ((ipf_objbytes[type][0] & 1) != 0) {
            if (objp->ipfo_size < ipf_objbytes[type][1]) {
                IPFERROR(51);
                return EINVAL;
            }
            size = ipf_objbytes[type][1];
        } else if (objp->ipfo_size == ipf_objbytes[type][1]) {
            size = objp->ipfo_size;
        } else {
            IPFERROR(52);
            return EINVAL;
        }
        error = COPYIN(objp->ipfo_ptr, ptr, size);
        if (error != 0) {
            IPFERROR(55);
            error = EFAULT;
        }
    } else {
#ifdef IPFILTER_COMPAT
        error = ipf_in_compat(softc, objp, ptr, 0);
#else
        IPFERROR(54);
        error = EINVAL;
#endif
    }
    return error;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_inobjsz */
/* Returns: int - 0 = success, else failure */
/* Parameters: softc(I) - soft context pointer to work with */
/* data(I) - pointer to ioctl data */
/* ptr(I) - pointer to store real data in */
/* type(I) - type of structure being moved */
/* sz(I) - size of data to copy */
/* */
/* As per ipf_inobj, except the size of the object to copy in is passed in */
/* but it must not be smaller than the size defined for the type and the */
/* type must allow for varied sized objects. The extra requirement here is */
/* that sz must match the size of the object being passed in - this is */
/* neither possible nor required in ipf_inobj(). */
/* ------------------------------------------------------------------------ */
int
ipf_inobjsz(softc, data, ptr, type, sz)
    ipf_main_softc_t *softc;
    void *data;
    void *ptr;
    int type, sz;
{
    ipfobj_t obj;
    int error;

    if ((type < 0) || (type >= IPFOBJ_COUNT)) {
        IPFERROR(56);
        return EINVAL;
    }

    error = BCOPYIN(data, &obj, sizeof(obj));
    if (error != 0) {
        IPFERROR(125);
        return EFAULT;
    }

    if (obj.ipfo_type != type) {
        IPFERROR(58);
        return EINVAL;
    }

    if (obj.ipfo_rev >= ipf_objbytes[type][2]) {
        if (((ipf_objbytes[type][0] & 1) == 0) ||
            (sz < ipf_objbytes[type][1])) {
            IPFERROR(57);
            return EINVAL;
        }
        error = COPYIN(obj.ipfo_ptr, ptr, sz);
        if (error != 0) {
            IPFERROR(61);
            error = EFAULT;
        }
    } else {
#ifdef IPFILTER_COMPAT
        error = ipf_in_compat(softc, &obj, ptr, sz);
#else
        IPFERROR(60);
        error = EINVAL;
#endif
    }
    return error;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_outobjsz */
/* Returns: int - 0 = success, else failure */
/* Parameters: data(I) - pointer to ioctl data */
/* ptr(I) - pointer to store real data in */
/* type(I) - type of structure being moved */
/* sz(I) - size of data to copy */
/* */
/* As per ipf_outobj, except the size of the object to copy out is passed */
/* in but it must not be smaller than the size defined for the type and the */
/* type must allow for varied sized objects. The extra requirement here is */
/* that sz must match the size of the object being passed in - this is */
/* neither possible nor required in ipf_outobj(). */
/* ------------------------------------------------------------------------ */
int
ipf_outobjsz(softc, data, ptr, type, sz)
    ipf_main_softc_t *softc;
    void *data;
    void *ptr;
    int type, sz;
{
    ipfobj_t obj;
    int error;

    if ((type < 0) || (type >= IPFOBJ_COUNT)) {
        IPFERROR(62);
        return EINVAL;
    }

    error = BCOPYIN(data, &obj, sizeof(obj));
    if (error != 0) {
        IPFERROR(127);
        return EFAULT;
    }

    if (obj.ipfo_type != type) {
        IPFERROR(63);
        return EINVAL;
    }

    if (obj.ipfo_rev >= ipf_objbytes[type][2]) {
        if (((ipf_objbytes[type][0] & 1) == 0) ||
            (sz < ipf_objbytes[type][1])) {
            IPFERROR(146);
            return EINVAL;
        }
        error = COPYOUT(ptr, obj.ipfo_ptr, sz);
        if (error != 0) {
            IPFERROR(66);
            error = EFAULT;
        }
    } else {
#ifdef IPFILTER_COMPAT
        error = ipf_out_compat(softc, &obj, ptr);
#else
        IPFERROR(65);
        error = EINVAL;
#endif
    }
    return error;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_outobj */
/* Returns: int - 0 = success, else failure */
/* Parameters: data(I) - pointer to ioctl data */
/* ptr(I) - pointer to data to copy out */
/* type(I) - type of structure being moved */
/* */
/* Copy out the contents of what ptr points to, to where the ipfobj_t */
/* points. In the future, we can add things to check for version numbers, */
/* sizes, etc, to make it backward compatible at the ABI for user land. */
/* ------------------------------------------------------------------------ */
int
ipf_outobj(softc, data, ptr, type)
    ipf_main_softc_t *softc;
    void *data;
    void *ptr;
    int type;
{
    ipfobj_t obj;
    int error;

    if ((type < 0) || (type >= IPFOBJ_COUNT)) {
        IPFERROR(67);
        return EINVAL;
    }

    error = BCOPYIN(data, &obj, sizeof(obj));
    if (error != 0) {
        IPFERROR(126);
        return EFAULT;
    }

    if (obj.ipfo_type != type) {
        IPFERROR(68);
        return EINVAL;
    }

    if (obj.ipfo_rev >= ipf_objbytes[type][2]) {
        if ((ipf_objbytes[type][0] & 1) != 0) {
            if (obj.ipfo_size < ipf_objbytes[type][1]) {
                IPFERROR(69);
                return EINVAL;
            }
        } else if (obj.ipfo_size != ipf_objbytes[type][1]) {
            IPFERROR(70);
            return EINVAL;
        }

        error = COPYOUT(ptr, obj.ipfo_ptr, obj.ipfo_size);
        if (error != 0) {
            IPFERROR(73);
            error = EFAULT;
        }
    } else {
#ifdef IPFILTER_COMPAT
        error = ipf_out_compat(softc, &obj, ptr);
#else
        IPFERROR(72);
        error = EINVAL;
#endif
    }
    return error;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_outobjk */
/* Returns: int - 0 = success, else failure */
/* Parameters: obj(I) - pointer to data description structure */
/* ptr(I) - pointer to kernel data to copy out */
/* */
/* In the above functions, the ipfobj_t structure is copied into the */
/* kernel, telling ipfilter how to copy out data. In this instance, the */
/* ipfobj_t is already populated with information and now we just need to */
/* use it. There is no need for this function to have a "type" parameter */
/* as there is no point in validating information that comes from the */
/* kernel with itself. */
/* ------------------------------------------------------------------------ */
int
ipf_outobjk(softc, obj, ptr)
    ipf_main_softc_t *softc;
    ipfobj_t *obj;
    void *ptr;
{
    int type = obj->ipfo_type;
    int error;

    if ((type < 0) || (type >= IPFOBJ_COUNT)) {
        IPFERROR(147);
        return EINVAL;
    }

    if (obj->ipfo_rev >= ipf_objbytes[type][2]) {
        if ((ipf_objbytes[type][0] & 1) != 0) {
            if (obj->ipfo_size < ipf_objbytes[type][1]) {
                IPFERROR(148);
                return EINVAL;
            }
        } else if (obj->ipfo_size != ipf_objbytes[type][1]) {
            IPFERROR(149);
            return EINVAL;
        }

        error = COPYOUT(ptr, obj->ipfo_ptr, obj->ipfo_size);
        if (error != 0) {
            IPFERROR(150);
            error = EFAULT;
        }
    } else {
#ifdef IPFILTER_COMPAT
        error = ipf_out_compat(softc, obj, ptr);
#else
        IPFERROR(151);
        error = EINVAL;
#endif
    }
    return error;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_checkl4sum */
/* Returns: int - 0 = good, -1 = bad, 1 = cannot check */
/* Parameters: fin(I) - pointer to packet information */
/* */
/* If possible, calculate the layer 4 checksum for the packet. If this is */
/* not possible, return without indicating a failure or success but in a */
/* way that is distinguishable. This function should only be called by */
/* ipf_checkv6sum() for each platform. */
/* ------------------------------------------------------------------------ */
INLINE int
ipf_checkl4sum(fin)
    fr_info_t *fin;
{
    u_short sum, hdrsum, *csump;
    udphdr_t *udp;
    int dosum;

    /*
     * If the packet isn't a fragment, isn't too short and otherwise
     * isn't already considered "bad", then validate the checksum. If
     * this check fails then consider the packet to be "bad".
     */
    if ((fin->fin_flx & (FI_FRAG|FI_SHORT|FI_BAD)) != 0)
        return 1;

    csump = NULL;
    hdrsum = 0;
    dosum = 0;
    sum = 0;

    switch (fin->fin_p)
    {
    case IPPROTO_TCP :
        csump = &((tcphdr_t *)fin->fin_dp)->th_sum;
        dosum = 1;
        break;

    case IPPROTO_UDP :
        udp = fin->fin_dp;
        if (udp->uh_sum != 0) {
            csump = &udp->uh_sum;
            dosum = 1;
        }
        break;

#ifdef USE_INET6
    case IPPROTO_ICMPV6 :
        csump = &((struct icmp6_hdr *)fin->fin_dp)->icmp6_cksum;
        dosum = 1;
        break;
#endif

    case IPPROTO_ICMP :
        csump = &((struct icmp *)fin->fin_dp)->icmp_cksum;
        dosum = 1;
        break;

    default :
        return 1;
        /*NOTREACHED*/
    }

    if (csump != NULL)
        hdrsum = *csump;

    if (dosum) {
        sum = fr_cksum(fin, fin->fin_ip, fin->fin_p, fin->fin_dp);
    }
#if !defined(_KERNEL)
    if (sum == hdrsum) {
        FR_DEBUG(("checkl4sum: %hx == %hx\n", sum, hdrsum));
    } else {
        FR_DEBUG(("checkl4sum: %hx != %hx\n", sum, hdrsum));
    }
#endif
    DT2(l4sums, u_short, hdrsum, u_short, sum);
    if (hdrsum == sum) {
        fin->fin_cksum = FI_CK_SUMOK;
        return 0;
    }
    fin->fin_cksum = FI_CK_BAD;
    return -1;
}


/* ------------------------------------------------------------------------ */
/* Function: ipf_ifpfillv4addr */
/* Returns: int - 0 = address update, -1 = address not updated */
/* Parameters: atype(I) - type of network address update to perform */
/* sin(I) - pointer to source of address information */
/* mask(I) - pointer to source of netmask information */
/* inp(I) - pointer to destination address store */
/* inpmask(I) - pointer to destination netmask store */
/* */
/* Given a type of network address update (atype) to perform, copy */
/* information from sin/mask into inp/inpmask. If inpmask is NULL then no */
/* netmask update is performed unless FRI_NETMASKED is passed as atype, in */
/* which case the operation fails. For all values of atype other than */
/* FRI_NETMASKED, if inpmask is non-NULL then the mask is set to an all 1s */
/* value. */
/* ------------------------------------------------------------------------ */
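/*
 * Usage sketch (hypothetical values, deliberately not compiled): derive
 * the network address 192.0.2.0/24 from an interface address of 192.0.2.1.
 */
#if 0
    struct sockaddr_in sin, msk;
    struct in_addr addr, mask;

    sin.sin_addr.s_addr = htonl(0xc0000201);    /* 192.0.2.1 */
    msk.sin_addr.s_addr = htonl(0xffffff00);    /* 255.255.255.0 */
    if (ipf_ifpfillv4addr(FRI_NETWORK, &sin, &msk, &addr, &mask) == 0) {
        /* addr is now 192.0.2.0; mask is all 1s because the */
        /* atype used here is not FRI_NETMASKED. */
    }
#endif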
int
ipf_ifpfillv4addr(atype, sin, mask, inp, inpmask)
    int atype;
    struct sockaddr_in *sin, *mask;
    struct in_addr *inp, *inpmask;
{
    if (inpmask != NULL && atype != FRI_NETMASKED)
        inpmask->s_addr = 0xffffffff;

    if (atype == FRI_NETWORK || atype == FRI_NETMASKED) {
        if (atype == FRI_NETMASKED) {
            if (inpmask == NULL)
                return -1;
            inpmask->s_addr = mask->sin_addr.s_addr;
        }
        inp->s_addr = sin->sin_addr.s_addr & mask->sin_addr.s_addr;
    } else {
        inp->s_addr = sin->sin_addr.s_addr;
    }
    return 0;
}


#ifdef USE_INET6
/* ------------------------------------------------------------------------ */
/* Function: ipf_ifpfillv6addr */
/* Returns: int - 0 = address update, -1 = address not updated */
/* Parameters: atype(I) - type of network address update to perform */
/* sin(I) - pointer to source of address information */
/* mask(I) - pointer to source of netmask information */
/* inp(I) - pointer to destination address store */
/* inpmask(I) - pointer to destination netmask store */
/* */
/* Given a type of network address update (atype) to perform, copy */
/* information from sin/mask into inp/inpmask. If inpmask is NULL then no */
/* netmask update is performed unless FRI_NETMASKED is passed as atype, in */
/* which case the operation fails. For all values of atype other than */
/* FRI_NETMASKED, if inpmask is non-NULL then the mask is set to an all 1s */
/* value. */
/* ------------------------------------------------------------------------ */
int
ipf_ifpfillv6addr(atype, sin, mask, inp, inpmask)
    int atype;
    struct sockaddr_in6 *sin, *mask;
    i6addr_t *inp, *inpmask;
{
    i6addr_t *src, *and;

    src = (i6addr_t *)&sin->sin6_addr;
    and = (i6addr_t *)&mask->sin6_addr;

    if (inpmask != NULL && atype != FRI_NETMASKED) {
        inpmask->i6[0] = 0xffffffff;
        inpmask->i6[1] = 0xffffffff;
        inpmask->i6[2] = 0xffffffff;
        inpmask->i6[3] = 0xffffffff;
    }

    if (atype == FRI_NETWORK || atype == FRI_NETMASKED) {
        if (atype == FRI_NETMASKED) {
            if (inpmask == NULL)
                return -1;
            inpmask->i6[0] = and->i6[0];
            inpmask->i6[1] = and->i6[1];
            inpmask->i6[2] = and->i6[2];
            inpmask->i6[3] = and->i6[3];
        }

        inp->i6[0] = src->i6[0] & and->i6[0];
        inp->i6[1] = src->i6[1] & and->i6[1];
        inp->i6[2] = src->i6[2] & and->i6[2];
        inp->i6[3] = src->i6[3] & and->i6[3];
    } else {
        inp->i6[0] = src->i6[0];
        inp->i6[1] = src->i6[1];
        inp->i6[2] = src->i6[2];
        inp->i6[3] = src->i6[3];
    }
    return 0;
}
#endif


/* ------------------------------------------------------------------------ */
/* Function: ipf_matchtag */
/* Returns: 0 == mismatch, 1 == match. */
/* Parameters: tag1(I) - pointer to first tag to compare */
/* tag2(I) - pointer to second tag to compare */
/* */
/* Returns true (non-zero) or false (0) if the two tag structures can be */
/* considered to be a match or not match, respectively. The tag is 16 */
/* bytes long (16 characters) but that is overlaid with 4 32bit ints so */
/* compare the ints instead, for speed. tag1 is the master of the */
/* comparison. This function should only be called with both tag1 and tag2 */
/* as non-NULL pointers. */
/* ------------------------------------------------------------------------ */
*/ /* ------------------------------------------------------------------------ */ int ipf_matchtag(tag1, tag2) ipftag_t *tag1, *tag2; { if (tag1 == tag2) return 1; if ((tag1->ipt_num[0] == 0) && (tag2->ipt_num[0] == 0)) return 1; if ((tag1->ipt_num[0] == tag2->ipt_num[0]) && (tag1->ipt_num[1] == tag2->ipt_num[1]) && (tag1->ipt_num[2] == tag2->ipt_num[2]) && (tag1->ipt_num[3] == tag2->ipt_num[3])) return 1; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_coalesce */ /* Returns: 1 == success, -1 == failure, 0 == no change */ /* Parameters: fin(I) - pointer to packet information */ /* */ /* Attempt to get all of the packet data into a single, contiguous buffer. */ /* If this call returns a failure then the buffers have also been freed. */ /* ------------------------------------------------------------------------ */ int ipf_coalesce(fin) fr_info_t *fin; { if ((fin->fin_flx & FI_COALESCE) != 0) return 1; /* * If the mbuf pointers indicate that there is no mbuf to work with, * return but do not indicate success or failure. */ if (fin->fin_m == NULL || fin->fin_mp == NULL) return 0; #if defined(_KERNEL) if (ipf_pullup(fin->fin_m, fin, fin->fin_plen) == NULL) { ipf_main_softc_t *softc = fin->fin_main_soft; DT1(frb_coalesce, fr_info_t *, fin); LBUMP(ipf_stats[fin->fin_out].fr_badcoalesces); # ifdef MENTAT FREE_MB_T(*fin->fin_mp); # endif fin->fin_reason = FRB_COALESCE; *fin->fin_mp = NULL; fin->fin_m = NULL; return -1; } #else fin = fin; /* LINT */ #endif return 1; } /* * The following table lists all of the tunable variables that can be * accessed via SIOCIPFGET/SIOCIPFSET/SIOCIPFGETNEXT. The format of each row * in the table below is as follows: * * pointer to value, name of value, minimum, maximum, size of the value's * container, value attribute flags * * For convenience, IPFT_RDONLY means the value is read-only, IPFT_WRDISABLED * means the value can only be written to when IPFilter is loaded but disabled. * The obvious implication is if neither of these are set then the value can be * changed at any time without harm. */ /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_findbycookie */ /* Returns: NULL = search failed, else pointer to tune struct */ /* Parameters: cookie(I) - cookie value to search for amongst tuneables */ /* next(O) - pointer to place to store the cookie for the */ /* "next" tuneable, if it is desired. */ /* */ /* This function is used to walk through all of the existing tunables with */ /* successive calls. It searches the known tunables for the one which has */ /* a matching value for "cookie" - i.e. its address. When returning a match, */ /* the next one to be found may be returned inside next. */ /* ------------------------------------------------------------------------ */ static ipftuneable_t * ipf_tune_findbycookie(ptop, cookie, next) ipftuneable_t **ptop; void *cookie, **next; { ipftuneable_t *ta, **tap; for (ta = *ptop; ta->ipft_name != NULL; ta++) if (ta == cookie) { if (next != NULL) { /* * If the next entry in the array has a name * present, then return a pointer to it for * where to go next, else return a pointer to * the dynamic list as a key to search there * next. This facilitates a weak linking of * the two "lists" together.
*/ if ((ta + 1)->ipft_name != NULL) *next = ta + 1; else *next = ptop; } return ta; } for (tap = ptop; (ta = *tap) != NULL; tap = &ta->ipft_next) if (tap == cookie) { if (next != NULL) *next = &ta->ipft_next; return ta; } if (next != NULL) *next = NULL; return NULL; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_findbyname */ /* Returns: NULL = search failed, else pointer to tune struct */ /* Parameters: name(I) - name of the tuneable entry to find. */ /* */ /* Search the static array of tuneables and the list of dynamic tuneables */ /* for an entry with a matching name. If we can find one, return a pointer */ /* to the matching structure. */ /* ------------------------------------------------------------------------ */ static ipftuneable_t * ipf_tune_findbyname(top, name) ipftuneable_t *top; const char *name; { ipftuneable_t *ta; for (ta = top; ta != NULL; ta = ta->ipft_next) if (!strcmp(ta->ipft_name, name)) { return ta; } return NULL; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_add_array */ /* Returns: int - 0 == success, else failure */ /* Parameters: newtune - pointer to new tune array to add to tuneables */ /* */ /* Appends tune structures from the array passed in (newtune) to the end of */ /* the current list of "dynamic" tuneable parameters. */ /* If any entry to be added is already present (by name) then the operation */ /* is aborted - entries that have been added are removed before returning. */ /* An entry with no name (NULL) is used as the indication that the end of */ /* the array has been reached. */ /* ------------------------------------------------------------------------ */ int ipf_tune_add_array(softc, newtune) ipf_main_softc_t *softc; ipftuneable_t *newtune; { ipftuneable_t *nt, *dt; int error = 0; for (nt = newtune; nt->ipft_name != NULL; nt++) { error = ipf_tune_add(softc, nt); if (error != 0) { for (dt = newtune; dt != nt; dt++) { (void) ipf_tune_del(softc, dt); } break; } } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_array_link */ /* Returns: 0 == success, -1 == failure */ /* Parameters: softc(I) - soft context pointer to work with */ /* array(I) - pointer to an array of tuneables */ /* */ /* Given an array of tunables (array), append them to the current list of */ /* tuneables for this context (softc->ipf_tuners.) To properly prepare */ /* the array for being appended to the list, initialise all of the next */ /* pointers so we don't need to walk parts of it with ++ and others with */ /* next. The array is expected to have an entry with a NULL name as the */ /* terminator. Trying to add an array with no non-NULL names will return */ /* a failure. */ /* ------------------------------------------------------------------------ */ int ipf_tune_array_link(softc, array) ipf_main_softc_t *softc; ipftuneable_t *array; { ipftuneable_t *t, **p; t = array; if (t->ipft_name == NULL) return -1; for (; t[1].ipft_name != NULL; t++) t[0].ipft_next = &t[1]; t->ipft_next = NULL; /* * Since a pointer to the last entry isn't kept, we need to find it * each time we want to add new variables to the list.
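As a sketch of how the copy/link pair fits together (illustrative only: the "mymod" names below are hypothetical and not part of ipf):
*/
#if 0
	/*
	 * A module template whose ipftp_offset values are offsetof()s
	 * into struct mymod_softc, terminated by a NULL ipft_name.
	 */
	extern ipftuneable_t mymod_tunes_tmpl[];
	extern size_t mymod_tunes_size;
	struct mymod_softc *ms = mymod_get_softc();	/* hypothetical */
	ipftuneable_t *tunes;

	/* rebase the template onto this instance... */
	tunes = ipf_tune_array_copy(ms, mymod_tunes_size, mymod_tunes_tmpl);
	if (tunes != NULL)
		/* ...and append it to softc->ipf_tuners */
		(void) ipf_tune_array_link(softc, tunes);
#endif
/*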
*/ for (p = &softc->ipf_tuners; (t = *p) != NULL; p = &t->ipft_next) if (t->ipft_name == NULL) break; *p = array; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_array_unlink */ /* Returns: 0 == success, -1 == failure */ /* Parameters: softc(I) - soft context pointer to work with */ /* array(I) - pointer to an array of tuneables */ /* */ /* Given an array of tuneables previously appended with */ /* ipf_tune_array_link(), find it on the list and unhook it. */ /* ------------------------------------------------------------------------ */ int ipf_tune_array_unlink(softc, array) ipf_main_softc_t *softc; ipftuneable_t *array; { ipftuneable_t *t, **p; for (p = &softc->ipf_tuners; (t = *p) != NULL; p = &t->ipft_next) if (t == array) break; if (t == NULL) return -1; for (; t[1].ipft_name != NULL; t++) ; *p = t->ipft_next; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_array_copy */ /* Returns: NULL = failure, else pointer to new array */ /* Parameters: base(I) - pointer to structure base */ /* size(I) - size of the array at template */ /* template(I) - original array to copy */ /* */ /* Allocate memory for a new set of tuneable values and copy everything */ /* from template into the new region of memory. The new region is full of */ /* uninitialised pointers (ipft_next) so set them up. Now, ipftp_offset... */ /* */ /* NOTE: the following assumes that sizeof(long) == sizeof(void *) */ /* In the array template, ipftp_offset is the offset (in bytes) of the */ /* location of the tuneable value inside the structure pointed to by base. */ /* As ipftp_offset is a union over the pointers to the tuneable values, if */ /* we add base to the copy's ipftp_offset, copy ends up with a pointer in */ /* ipftp_void that points to the stored value. */ /* ------------------------------------------------------------------------ */ ipftuneable_t * ipf_tune_array_copy(base, size, template) void *base; size_t size; ipftuneable_t *template; { ipftuneable_t *copy; int i; KMALLOCS(copy, ipftuneable_t *, size); if (copy == NULL) { return NULL; } bcopy(template, copy, size); for (i = 0; copy[i].ipft_name; i++) { copy[i].ipft_una.ipftp_offset += (u_long)base; copy[i].ipft_next = copy + i + 1; } return copy; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_add */ /* Returns: int - 0 == success, else failure */ /* Parameters: newtune - pointer to new tune entry to add to tuneables */ /* */ /* Appends tune structures from the array passed in (newtune) to the end of */ /* the current list of "dynamic" tuneable parameters. Once added, the */ /* owner of the object is not expected to ever change "ipft_next". */ /* ------------------------------------------------------------------------ */ int ipf_tune_add(softc, newtune) ipf_main_softc_t *softc; ipftuneable_t *newtune; { ipftuneable_t *ta, **tap; ta = ipf_tune_findbyname(softc->ipf_tuners, newtune->ipft_name); if (ta != NULL) { IPFERROR(74); return EEXIST; } for (tap = &softc->ipf_tuners; *tap != NULL; tap = &(*tap)->ipft_next) ; newtune->ipft_next = NULL; *tap = newtune; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_del */ /* Returns: int - 0 == success, else failure */ /* Parameters: oldtune - pointer to tune entry to remove from the list of */ /* current dynamic tuneables */ /* */ /* Search for the tune structure, by pointer, in the list of those that are */ /* dynamically added at run time.
If found, adjust the list so that this */ /* structure is no longer part of it. */ /* ------------------------------------------------------------------------ */ int ipf_tune_del(softc, oldtune) ipf_main_softc_t *softc; ipftuneable_t *oldtune; { ipftuneable_t *ta, **tap; int error = 0; for (tap = &softc->ipf_tuners; (ta = *tap) != NULL; tap = &ta->ipft_next) { if (ta == oldtune) { *tap = oldtune->ipft_next; oldtune->ipft_next = NULL; break; } } if (ta == NULL) { error = ESRCH; IPFERROR(75); } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_tune_del_array */ /* Returns: int - 0 == success, else failure */ /* Parameters: oldtune - pointer to tuneables array */ /* */ /* Remove each tuneable entry in the array from the list of "dynamic" */ /* tunables. If one entry should fail to be found, an error will be */ /* returned and no further ones removed. */ /* An entry with a NULL name is used as the indicator of the last entry in */ /* the array. */ /* ------------------------------------------------------------------------ */ int ipf_tune_del_array(softc, oldtune) ipf_main_softc_t *softc; ipftuneable_t *oldtune; { ipftuneable_t *ot; int error = 0; for (ot = oldtune; ot->ipft_name != NULL; ot++) { error = ipf_tune_del(softc, ot); if (error != 0) break; } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_ipftune */ /* Returns: int - 0 == success, else failure */ /* Parameters: cmd(I) - ioctl command number */ /* data(I) - pointer to ioctl data structure */ /* */ /* Implement handling of SIOCIPFGETNEXT, SIOCIPFGET and SIOCIPFSET. These */ /* three ioctls provide the means to access and control global variables */ /* within IPFilter, allowing (for example) timeouts and table sizes to be */ /* changed without rebooting, reloading or recompiling. The initialisation */ /* and 'destruction' routines of the various components of ipfilter are */ /* each responsible for handling their own values being too big. */ /* ------------------------------------------------------------------------ */ int ipf_ipftune(softc, cmd, data) ipf_main_softc_t *softc; ioctlcmd_t cmd; void *data; { ipftuneable_t *ta; ipftune_t tu; void *cookie; int error; error = ipf_inobj(softc, data, NULL, &tu, IPFOBJ_TUNEABLE); if (error != 0) return error; tu.ipft_name[sizeof(tu.ipft_name) - 1] = '\0'; cookie = tu.ipft_cookie; ta = NULL; switch (cmd) { case SIOCIPFGETNEXT : /* * If cookie is non-NULL, assume it to be a pointer to the last * entry we looked at, so find it (if possible) and return a * pointer to the next one after it. The last entry in * the table is a NULL entry, so when we get to it, set cookie * to NULL and return that, indicating end of list, whereas * if we come in with cookie set to NULL, we are starting anew * at the front of the list. */ if (cookie != NULL) { ta = ipf_tune_findbycookie(&softc->ipf_tuners, cookie, &tu.ipft_cookie); } else { ta = softc->ipf_tuners; tu.ipft_cookie = ta + 1; } if (ta != NULL) { /* * Entry found, but does the data pointed to by that * row fit in what we can return?
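(From user space the whole walk looks roughly like the sketch below - illustrative only: fd handling and error checks are omitted and the object revision is assumed to be IPFILTER_VERSION.)
*/
#if 0
	ipftune_t tu;
	ipfobj_t obj;

	bzero((char *)&tu, sizeof(tu));		/* NULL cookie: start of list */
	obj.ipfo_rev = IPFILTER_VERSION;
	obj.ipfo_type = IPFOBJ_TUNEABLE;
	obj.ipfo_size = sizeof(tu);
	obj.ipfo_ptr = (void *)&tu;
	/* the kernel hands back the next cookie in ipft_cookie each call */
	while (ioctl(fd, SIOCIPFGETNEXT, &obj) == 0 && tu.ipft_cookie != NULL)
		printf("%s = %lu\n", tu.ipft_name, (u_long)tu.ipft_vlong);
#endif
/*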
*/ if (ta->ipft_sz > sizeof(tu.ipft_un)) { IPFERROR(76); return EINVAL; } tu.ipft_vlong = 0; if (ta->ipft_sz == sizeof(u_long)) tu.ipft_vlong = *ta->ipft_plong; else if (ta->ipft_sz == sizeof(u_int)) tu.ipft_vint = *ta->ipft_pint; else if (ta->ipft_sz == sizeof(u_short)) tu.ipft_vshort = *ta->ipft_pshort; else if (ta->ipft_sz == sizeof(u_char)) tu.ipft_vchar = *ta->ipft_pchar; tu.ipft_sz = ta->ipft_sz; tu.ipft_min = ta->ipft_min; tu.ipft_max = ta->ipft_max; tu.ipft_flags = ta->ipft_flags; bcopy(ta->ipft_name, tu.ipft_name, MIN(sizeof(tu.ipft_name), strlen(ta->ipft_name) + 1)); } error = ipf_outobj(softc, data, &tu, IPFOBJ_TUNEABLE); break; case SIOCIPFGET : case SIOCIPFSET : /* * Search by name or by cookie value for a particular entry * in the tuning parameter table. */ IPFERROR(77); error = ESRCH; if (cookie != NULL) { ta = ipf_tune_findbycookie(&softc->ipf_tuners, cookie, NULL); if (ta != NULL) error = 0; } else if (tu.ipft_name[0] != '\0') { ta = ipf_tune_findbyname(softc->ipf_tuners, tu.ipft_name); if (ta != NULL) error = 0; } if (error != 0) break; if (cmd == (ioctlcmd_t)SIOCIPFGET) { /* * Fetch the tuning parameters for a particular value */ tu.ipft_vlong = 0; if (ta->ipft_sz == sizeof(u_long)) tu.ipft_vlong = *ta->ipft_plong; else if (ta->ipft_sz == sizeof(u_int)) tu.ipft_vint = *ta->ipft_pint; else if (ta->ipft_sz == sizeof(u_short)) tu.ipft_vshort = *ta->ipft_pshort; else if (ta->ipft_sz == sizeof(u_char)) tu.ipft_vchar = *ta->ipft_pchar; tu.ipft_cookie = ta; tu.ipft_sz = ta->ipft_sz; tu.ipft_min = ta->ipft_min; tu.ipft_max = ta->ipft_max; tu.ipft_flags = ta->ipft_flags; error = ipf_outobj(softc, data, &tu, IPFOBJ_TUNEABLE); } else if (cmd == (ioctlcmd_t)SIOCIPFSET) { /* * Set an internal parameter. The hard part here is * getting the new value safely and correctly into * the kernel (given we only know its size, not type.) */ u_long in; if (((ta->ipft_flags & IPFT_WRDISABLED) != 0) && (softc->ipf_running > 0)) { IPFERROR(78); error = EBUSY; break; } in = tu.ipft_vlong; if (in < ta->ipft_min || in > ta->ipft_max) { IPFERROR(79); error = EINVAL; break; } if (ta->ipft_func != NULL) { SPL_INT(s); SPL_NET(s); error = (*ta->ipft_func)(softc, ta, &tu.ipft_un); SPL_X(s); } else if (ta->ipft_sz == sizeof(u_long)) { tu.ipft_vlong = *ta->ipft_plong; *ta->ipft_plong = in; } else if (ta->ipft_sz == sizeof(u_int)) { tu.ipft_vint = *ta->ipft_pint; *ta->ipft_pint = (u_int)(in & 0xffffffff); } else if (ta->ipft_sz == sizeof(u_short)) { tu.ipft_vshort = *ta->ipft_pshort; *ta->ipft_pshort = (u_short)(in & 0xffff); } else if (ta->ipft_sz == sizeof(u_char)) { tu.ipft_vchar = *ta->ipft_pchar; *ta->ipft_pchar = (u_char)(in & 0xff); } error = ipf_outobj(softc, data, &tu, IPFOBJ_TUNEABLE); } break; default : IPFERROR(80); error = EINVAL; break; } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_zerostats */ /* Returns: int - 0 = success, else failure */ /* Parameters: data(O) - pointer to pointer for copying data back to */ /* */ /* Copies the current statistics out to userspace and then zeroes the */ /* current ones in the kernel. The lock is only held across the bzero() as */ /* the copyout may result in paging (i.e. network activity).
*/ /* ------------------------------------------------------------------------ */ int ipf_zerostats(softc, data) ipf_main_softc_t *softc; caddr_t data; { friostat_t fio; ipfobj_t obj; int error; error = ipf_inobj(softc, data, &obj, &fio, IPFOBJ_IPFSTAT); if (error != 0) return error; ipf_getstat(softc, &fio, obj.ipfo_rev); error = ipf_outobj(softc, data, &fio, IPFOBJ_IPFSTAT); if (error != 0) return error; WRITE_ENTER(&softc->ipf_mutex); bzero(&softc->ipf_stats, sizeof(softc->ipf_stats)); RWLOCK_EXIT(&softc->ipf_mutex); return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_resolvedest */ /* Returns: int - 0 == success, else error code */ /* Parameters: softc(I) - pointer to soft context main structure */ /* base(I) - where strings are stored */ /* fdp(IO) - pointer to destination information to resolve */ /* v(I) - IP protocol version to match */ /* */ /* Looks up an interface name in the frdest structure pointed to by fdp and */ /* if a matching name can be found for the particular IP protocol version */ /* then store the interface pointer in the frdest struct. If no match is */ /* found, then set the interface pointer to be -1 as NULL is considered to */ /* indicate there is no information at all in the structure. */ /* ------------------------------------------------------------------------ */ int ipf_resolvedest(softc, base, fdp, v) ipf_main_softc_t *softc; char *base; frdest_t *fdp; int v; { int errval = 0; void *ifp; ifp = NULL; if (fdp->fd_name != -1) { if (fdp->fd_type == FRD_DSTLIST) { ifp = ipf_lookup_res_name(softc, IPL_LOGIPF, IPLT_DSTLIST, base + fdp->fd_name, NULL); if (ifp == NULL) { IPFERROR(144); errval = ESRCH; } } else { ifp = GETIFP(base + fdp->fd_name, v); if (ifp == NULL) ifp = (void *)-1; } } fdp->fd_ptr = ifp; if ((ifp != NULL) && (ifp != (void *)-1)) { fdp->fd_local = ipf_deliverlocal(softc, v, ifp, &fdp->fd_ip6); } return errval; } /* ------------------------------------------------------------------------ */ /* Function: ipf_resolvenic */ /* Returns: void* - NULL = wildcard name, -1 = failed to find NIC, else */ /* pointer to interface structure for NIC */ /* Parameters: softc(I)- pointer to soft context main structure */ /* name(I) - complete interface name */ /* v(I) - IP protocol version */ /* */ /* Look for a network interface structure that firstly has a matching name */ /* to that passed in and that is also being used for that IP protocol */ /* version (necessary on some platforms where there are separate listings */ /* for both IPv4 and IPv6 on the same physical NIC). */ /* ------------------------------------------------------------------------ */ void * ipf_resolvenic(softc, name, v) ipf_main_softc_t *softc; char *name; int v; { void *nic; softc = softc; /* gcc -Wextra */ if (name[0] == '\0') return NULL; if ((name[1] == '\0') && ((name[0] == '-') || (name[0] == '*'))) { return NULL; } nic = GETIFP(name, v); if (nic == NULL) nic = (void *)-1; return nic; } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_expire */ /* Returns: None. */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* This function is run every ipf tick to see if there are any tokens that */ /* have been held for too long and need to be freed up.
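Because tokens are always (re)inserted at the tail with ipt_die set to ipf_ticks + 20 (see ipf_token_find below), the list is ordered by expiry and the scan can stop at the first token that is still alive.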
*/ /* ------------------------------------------------------------------------ */ void ipf_token_expire(softc) ipf_main_softc_t *softc; { ipftoken_t *it; WRITE_ENTER(&softc->ipf_tokens); while ((it = softc->ipf_token_head) != NULL) { if (it->ipt_die > softc->ipf_ticks) break; ipf_token_deref(softc, it); } RWLOCK_EXIT(&softc->ipf_tokens); } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_flush */ /* Returns: None. */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Loop through all of the existing tokens and call deref to see if they */ /* can be freed. Normally a function like this might just loop on */ /* ipf_token_head but there is a chance that a token might have a ref count */ /* of greater than one and in that case the reference would drop twice */ /* by code that is only entitled to drop it once. */ /* ------------------------------------------------------------------------ */ static void ipf_token_flush(softc) ipf_main_softc_t *softc; { ipftoken_t *it, *next; WRITE_ENTER(&softc->ipf_tokens); for (it = softc->ipf_token_head; it != NULL; it = next) { next = it->ipt_next; (void) ipf_token_deref(softc, it); } RWLOCK_EXIT(&softc->ipf_tokens); } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_del */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I)- pointer to soft context main structure */ /* type(I) - the token type to match */ /* uid(I) - uid owning the token */ /* ptr(I) - context pointer for the token */ /* */ /* This function looks for a token in the current list that matches up */ /* the fields (type, uid, ptr). If none is found, ESRCH is returned, else */ /* call ipf_token_deref() to remove it from the list. In the event that */ /* the token has a reference held elsewhere, setting ipt_complete to 2 */ /* enables debugging to distinguish between the two paths that ultimately */ /* lead to a token being deleted. */ /* ------------------------------------------------------------------------ */ int ipf_token_del(softc, type, uid, ptr) ipf_main_softc_t *softc; int type, uid; void *ptr; { ipftoken_t *it; int error; IPFERROR(82); error = ESRCH; WRITE_ENTER(&softc->ipf_tokens); for (it = softc->ipf_token_head; it != NULL; it = it->ipt_next) { if (ptr == it->ipt_ctx && type == it->ipt_type && uid == it->ipt_uid) { it->ipt_complete = 2; ipf_token_deref(softc, it); error = 0; break; } } RWLOCK_EXIT(&softc->ipf_tokens); return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_mark_complete */ /* Returns: None. */ /* Parameters: token(I) - pointer to token structure */ /* */ /* Mark a token as being ineligible for being found with ipf_token_find. */ /* ------------------------------------------------------------------------ */ void ipf_token_mark_complete(token) ipftoken_t *token; { if (token->ipt_complete == 0) token->ipt_complete = 1; } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_find */ /* Returns: ipftoken_t * - NULL if no memory, else pointer to token */ /* Parameters: softc(I)- pointer to soft context main structure */ /* type(I) - the token type to match */ /* uid(I) - uid owning the token */ /* ptr(I) - context pointer for the token */ /* */ /* This function looks for a live token in the list of current tokens that */ /* matches the tuple (type, uid, ptr). If one cannot be found then one is */ /* allocated.
If one is found then it is moved to the top of the list of */ /* currently active tokens. */ /* ------------------------------------------------------------------------ */ ipftoken_t * ipf_token_find(softc, type, uid, ptr) ipf_main_softc_t *softc; int type, uid; void *ptr; { ipftoken_t *it, *new; KMALLOC(new, ipftoken_t *); if (new != NULL) bzero((char *)new, sizeof(*new)); WRITE_ENTER(&softc->ipf_tokens); for (it = softc->ipf_token_head; it != NULL; it = it->ipt_next) { if ((ptr == it->ipt_ctx) && (type == it->ipt_type) && (uid == it->ipt_uid) && (it->ipt_complete < 2)) break; } if (it == NULL) { it = new; new = NULL; if (it == NULL) { RWLOCK_EXIT(&softc->ipf_tokens); return NULL; } it->ipt_ctx = ptr; it->ipt_uid = uid; it->ipt_type = type; it->ipt_ref = 1; } else { if (new != NULL) { KFREE(new); new = NULL; } if (it->ipt_complete > 0) it = NULL; else ipf_token_unlink(softc, it); } if (it != NULL) { it->ipt_pnext = softc->ipf_token_tail; *softc->ipf_token_tail = it; softc->ipf_token_tail = &it->ipt_next; it->ipt_next = NULL; it->ipt_ref++; it->ipt_die = softc->ipf_ticks + 20; } RWLOCK_EXIT(&softc->ipf_tokens); return it; } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_unlink */ /* Returns: None. */ /* Parameters: softc(I) - pointer to soft context main structure */ /* token(I) - pointer to token structure */ /* Write Locks: ipf_tokens */ /* */ /* This function unlinks a token structure from the linked list of tokens */ /* that "own" it. The head pointer never needs to be explicitly adjusted */ /* but the tail does due to the linked list implementation. */ /* ------------------------------------------------------------------------ */ static void ipf_token_unlink(softc, token) ipf_main_softc_t *softc; ipftoken_t *token; { if (softc->ipf_token_tail == &token->ipt_next) softc->ipf_token_tail = token->ipt_pnext; *token->ipt_pnext = token->ipt_next; if (token->ipt_next != NULL) token->ipt_next->ipt_pnext = token->ipt_pnext; token->ipt_next = NULL; token->ipt_pnext = NULL; } /* ------------------------------------------------------------------------ */ /* Function: ipf_token_deref */ /* Returns: int - 0 == token freed, else reference count */ /* Parameters: softc(I) - pointer to soft context main structure */ /* token(I) - pointer to token structure */ /* Write Locks: ipf_tokens */ /* */ /* Drop the reference count on the token structure and if it drops to zero, */ /* call the dereference function for the token type because it is then */ /* possible to free the token data structure. 
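For example, a token freshly created by ipf_token_find comes back with ipt_ref == 2 - one reference for the token list, one for the caller - so the caller's matching deref leaves it on the list to be found again by the next ioctl or reaped later by ipf_token_expire.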
*/ /* ------------------------------------------------------------------------ */ int ipf_token_deref(softc, token) ipf_main_softc_t *softc; ipftoken_t *token; { void *data, **datap; ASSERT(token->ipt_ref > 0); token->ipt_ref--; if (token->ipt_ref > 0) return token->ipt_ref; data = token->ipt_data; datap = &data; if ((data != NULL) && (data != (void *)-1)) { switch (token->ipt_type) { case IPFGENITER_IPF : (void) ipf_derefrule(softc, (frentry_t **)datap); break; case IPFGENITER_IPNAT : WRITE_ENTER(&softc->ipf_nat); ipf_nat_rule_deref(softc, (ipnat_t **)datap); RWLOCK_EXIT(&softc->ipf_nat); break; case IPFGENITER_NAT : ipf_nat_deref(softc, (nat_t **)datap); break; case IPFGENITER_STATE : ipf_state_deref(softc, (ipstate_t **)datap); break; case IPFGENITER_FRAG : ipf_frag_pkt_deref(softc, (ipfr_t **)datap); break; case IPFGENITER_NATFRAG : ipf_frag_nat_deref(softc, (ipfr_t **)datap); break; case IPFGENITER_HOSTMAP : WRITE_ENTER(&softc->ipf_nat); ipf_nat_hostmapdel(softc, (hostmap_t **)datap); RWLOCK_EXIT(&softc->ipf_nat); break; default : ipf_lookup_iterderef(softc, token->ipt_type, data); break; } } ipf_token_unlink(softc, token); KFREE(token); return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_nextrule */ /* Returns: frentry_t * - NULL == no more rules, else pointer to next */ /* Parameters: softc(I) - pointer to soft context main structure */ /* fr(I) - pointer to filter rule */ /* out(I) - 1 == out rules, 0 == input rules */ /* */ /* Starting with "fr", find the next rule to visit. This includes visiting */ /* the list of rule groups if either fr is NULL (empty list) or it is the */ /* last rule in the list. When walking rule lists, it is either input or */ /* output rules that are returned, never both. */ /* ------------------------------------------------------------------------ */ static frentry_t * ipf_nextrule(softc, active, unit, fr, out) ipf_main_softc_t *softc; int active, unit; frentry_t *fr; int out; { frentry_t *next; frgroup_t *fg; if (fr != NULL && fr->fr_group != -1) { fg = ipf_findgroup(softc, fr->fr_names + fr->fr_group, unit, active, NULL); if (fg != NULL) fg = fg->fg_next; } else { fg = softc->ipf_groups[unit][active]; } while (fg != NULL) { next = fg->fg_start; while (next != NULL) { if (out) { if (next->fr_flags & FR_OUTQUE) return next; } else if (next->fr_flags & FR_INQUE) { return next; } next = next->fr_next; } if (next == NULL) fg = fg->fg_next; } return NULL; } /* ------------------------------------------------------------------------ */ /* Function: ipf_getnextrule */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I)- pointer to soft context main structure */ /* t(I) - pointer to destination information to resolve */ /* ptr(I) - pointer to ipfobj_t to copyin from user space */ /* */ /* This function's first job is to bring in the ipfruleiter_t structure via */ /* the ipfobj_t structure to determine what should be the next rule to */ /* return. Once the ipfruleiter_t has been brought in, it then tries to */ /* find the 'next rule'. This may include searching rule group lists or */ /* just be as simple as looking at the 'next' field in the rule structure. */ /* When we have found the rule to return, increase its reference count and */ /* if we used an existing rule to get here, decrease its reference count. 
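From user space, one step of the walk looks roughly like this sketch (illustrative only: fd and error handling are omitted, iri_active is a hypothetical set number, and real callers size the destination buffer for fr_size plus fr_dsize rather than a bare frentry_t):
*/
#if 0
	ipfruleiter_t it;
	ipfobj_t obj;
	frentry_t fr;

	bzero((char *)&it, sizeof(it));	/* empty iri_group: main list */
	it.iri_inout = 0;		/* input rules */
	it.iri_active = 1;
	it.iri_nrules = 1;
	it.iri_rule = &fr;
	obj.ipfo_rev = IPFILTER_VERSION;
	obj.ipfo_type = IPFOBJ_IPFITER;
	obj.ipfo_size = sizeof(it);
	obj.ipfo_ptr = (void *)&it;
	(void) ioctl(fd, SIOCIPFITER, &obj);	/* one rule copied out */
#endif
/*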
*/ /* ------------------------------------------------------------------------ */ int ipf_getnextrule(softc, t, ptr) ipf_main_softc_t *softc; ipftoken_t *t; void *ptr; { frentry_t *fr, *next, zero; ipfruleiter_t it; int error, out; frgroup_t *fg; ipfobj_t obj; int predict; char *dst; int unit; if (t == NULL || ptr == NULL) { IPFERROR(84); return EFAULT; } error = ipf_inobj(softc, ptr, &obj, &it, IPFOBJ_IPFITER); if (error != 0) return error; if ((it.iri_inout < 0) || (it.iri_inout > 3)) { IPFERROR(85); return EINVAL; } if ((it.iri_active != 0) && (it.iri_active != 1)) { IPFERROR(86); return EINVAL; } if (it.iri_nrules == 0) { IPFERROR(87); return ENOSPC; } if (it.iri_rule == NULL) { IPFERROR(88); return EFAULT; } fg = NULL; fr = t->ipt_data; if ((it.iri_inout & F_OUT) != 0) out = 1; else out = 0; if ((it.iri_inout & F_ACIN) != 0) unit = IPL_LOGCOUNT; else unit = IPL_LOGIPF; READ_ENTER(&softc->ipf_mutex); if (fr == NULL) { if (*it.iri_group == '\0') { if (unit == IPL_LOGCOUNT) { next = softc->ipf_acct[out][it.iri_active]; } else { next = softc->ipf_rules[out][it.iri_active]; } if (next == NULL) next = ipf_nextrule(softc, it.iri_active, unit, NULL, out); } else { fg = ipf_findgroup(softc, it.iri_group, unit, it.iri_active, NULL); if (fg != NULL) next = fg->fg_start; else next = NULL; } } else { next = fr->fr_next; if (next == NULL) next = ipf_nextrule(softc, it.iri_active, unit, fr, out); } if (next != NULL && next->fr_next != NULL) predict = 1; else if (ipf_nextrule(softc, it.iri_active, unit, next, out) != NULL) predict = 1; else predict = 0; if (fr != NULL) (void) ipf_derefrule(softc, &fr); obj.ipfo_type = IPFOBJ_FRENTRY; dst = (char *)it.iri_rule; if (next != NULL) { obj.ipfo_size = next->fr_size; MUTEX_ENTER(&next->fr_lock); next->fr_ref++; MUTEX_EXIT(&next->fr_lock); t->ipt_data = next; } else { obj.ipfo_size = sizeof(frentry_t); bzero(&zero, sizeof(zero)); next = &zero; t->ipt_data = NULL; } it.iri_rule = predict ? next : NULL; if (predict == 0) ipf_token_mark_complete(t); RWLOCK_EXIT(&softc->ipf_mutex); obj.ipfo_ptr = dst; error = ipf_outobjk(softc, &obj, next); if (error == 0 && t->ipt_data != NULL) { dst += obj.ipfo_size; if (next->fr_data != NULL) { ipfobj_t dobj; if (next->fr_type == FR_T_IPFEXPR) dobj.ipfo_type = IPFOBJ_IPFEXPR; else dobj.ipfo_type = IPFOBJ_FRIPF; dobj.ipfo_size = next->fr_dsize; dobj.ipfo_rev = obj.ipfo_rev; dobj.ipfo_ptr = dst; error = ipf_outobjk(softc, &dobj, next->fr_data); } } if ((fr != NULL) && (next == &zero)) (void) ipf_derefrule(softc, &fr); return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_frruleiter */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I)- pointer to soft context main structure */ /* data(I) - pointer to ioctl data */ /* uid(I) - uid owning the token */ /* ptr(I) - context pointer for the token */ /* */ /* This function serves as a stepping stone between ipf_ipf_ioctl and */ /* ipf_getnextrule. Its role is to find the right token in the kernel for */ /* the process doing the ioctl and use that to ask for the next rule.
*/ /* ------------------------------------------------------------------------ */ static int ipf_frruleiter(softc, data, uid, ctx) ipf_main_softc_t *softc; void *data, *ctx; int uid; { ipftoken_t *token; ipfruleiter_t it; ipfobj_t obj; int error; token = ipf_token_find(softc, IPFGENITER_IPF, uid, ctx); if (token != NULL) { error = ipf_getnextrule(softc, token, data); WRITE_ENTER(&softc->ipf_tokens); ipf_token_deref(softc, token); RWLOCK_EXIT(&softc->ipf_tokens); } else { error = ipf_inobj(softc, data, &obj, &it, IPFOBJ_IPFITER); if (error != 0) return error; it.iri_rule = NULL; error = ipf_outobj(softc, data, &it, IPFOBJ_IPFITER); } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_geniter */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I) - pointer to soft context main structure */ /* token(I) - pointer to ipftoken_t structure */ /* itp(I) - pointer to iterator data */ /* */ /* Decide which iterator function to call using information passed through */ /* the ipfgeniter_t structure at itp. */ /* ------------------------------------------------------------------------ */ static int ipf_geniter(softc, token, itp) ipf_main_softc_t *softc; ipftoken_t *token; ipfgeniter_t *itp; { int error; switch (itp->igi_type) { case IPFGENITER_FRAG : error = ipf_frag_pkt_next(softc, token, itp); break; default : IPFERROR(92); error = EINVAL; break; } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_genericiter */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I)- pointer to soft context main structure */ /* data(I) - pointer to ioctl data */ /* uid(I) - uid owning the token */ /* ptr(I) - context pointer for the token */ /* */ /* Handle the SIOCGENITER ioctl for the ipfilter device. The primary role */ /* here is to find the right token for the calling process and use it to */ /* fetch the next entry for the iterator type requested. */ /* ------------------------------------------------------------------------ */ int ipf_genericiter(softc, data, uid, ctx) ipf_main_softc_t *softc; void *data, *ctx; int uid; { ipftoken_t *token; ipfgeniter_t iter; int error; error = ipf_inobj(softc, data, NULL, &iter, IPFOBJ_GENITER); if (error != 0) return error; token = ipf_token_find(softc, iter.igi_type, uid, ctx); if (token != NULL) { token->ipt_subtype = iter.igi_type; error = ipf_geniter(softc, token, &iter); WRITE_ENTER(&softc->ipf_tokens); ipf_token_deref(softc, token); RWLOCK_EXIT(&softc->ipf_tokens); } else { IPFERROR(93); error = ESRCH; } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_ipf_ioctl */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I)- pointer to soft context main structure */ /* data(I) - pointer to ioctl data */ /* cmd(I) - the ioctl command number */ /* mode(I) - mode flags for the ioctl */ /* uid(I) - uid owning the token */ /* ptr(I) - context pointer for the token */ /* */ /* This function handles all of the ioctl commands that are actually issued */ /* to the /dev/ipl device.
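For instance, turning the filter on from user space is just the following (sketch; fd handling omitted):
*/
#if 0
	int on = 1;

	/* SIOCFRENB: a non-zero int attaches ipf, zero detaches it */
	(void) ioctl(fd, SIOCFRENB, &on);
#endif
/*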
*/ /* ------------------------------------------------------------------------ */ int ipf_ipf_ioctl(softc, data, cmd, mode, uid, ctx) ipf_main_softc_t *softc; caddr_t data; ioctlcmd_t cmd; int mode, uid; void *ctx; { friostat_t fio; int error, tmp; ipfobj_t obj; SPL_INT(s); switch (cmd) { case SIOCFRENB : if (!(mode & FWRITE)) { IPFERROR(94); error = EPERM; } else { error = BCOPYIN(data, &tmp, sizeof(tmp)); if (error != 0) { IPFERROR(95); error = EFAULT; break; } WRITE_ENTER(&softc->ipf_global); if (tmp) { if (softc->ipf_running > 0) error = 0; else error = ipfattach(softc); if (error == 0) softc->ipf_running = 1; else (void) ipfdetach(softc); } else { if (softc->ipf_running == 1) error = ipfdetach(softc); else error = 0; if (error == 0) softc->ipf_running = -1; } RWLOCK_EXIT(&softc->ipf_global); } break; case SIOCIPFSET : if (!(mode & FWRITE)) { IPFERROR(96); error = EPERM; break; } /* FALLTHRU */ case SIOCIPFGETNEXT : case SIOCIPFGET : error = ipf_ipftune(softc, cmd, (void *)data); break; case SIOCSETFF : if (!(mode & FWRITE)) { IPFERROR(97); error = EPERM; } else { error = BCOPYIN(data, &softc->ipf_flags, sizeof(softc->ipf_flags)); if (error != 0) { IPFERROR(98); error = EFAULT; } } break; case SIOCGETFF : error = BCOPYOUT(&softc->ipf_flags, data, sizeof(softc->ipf_flags)); if (error != 0) { IPFERROR(99); error = EFAULT; } break; case SIOCFUNCL : error = ipf_resolvefunc(softc, (void *)data); break; case SIOCINAFR : case SIOCRMAFR : case SIOCADAFR : case SIOCZRLST : if (!(mode & FWRITE)) { IPFERROR(100); error = EPERM; } else { error = frrequest(softc, IPL_LOGIPF, cmd, (caddr_t)data, softc->ipf_active, 1); } break; case SIOCINIFR : case SIOCRMIFR : case SIOCADIFR : if (!(mode & FWRITE)) { IPFERROR(101); error = EPERM; } else { error = frrequest(softc, IPL_LOGIPF, cmd, (caddr_t)data, 1 - softc->ipf_active, 1); } break; case SIOCSWAPA : if (!(mode & FWRITE)) { IPFERROR(102); error = EPERM; } else { WRITE_ENTER(&softc->ipf_mutex); error = BCOPYOUT(&softc->ipf_active, data, sizeof(softc->ipf_active)); if (error != 0) { IPFERROR(103); error = EFAULT; } else { softc->ipf_active = 1 - softc->ipf_active; } RWLOCK_EXIT(&softc->ipf_mutex); } break; case SIOCGETFS : error = ipf_inobj(softc, (void *)data, &obj, &fio, IPFOBJ_IPFSTAT); if (error != 0) break; ipf_getstat(softc, &fio, obj.ipfo_rev); error = ipf_outobj(softc, (void *)data, &fio, IPFOBJ_IPFSTAT); break; case SIOCFRZST : if (!(mode & FWRITE)) { IPFERROR(104); error = EPERM; } else error = ipf_zerostats(softc, (caddr_t)data); break; case SIOCIPFFL : if (!(mode & FWRITE)) { IPFERROR(105); error = EPERM; } else { error = BCOPYIN(data, &tmp, sizeof(tmp)); if (!error) { tmp = ipf_flush(softc, IPL_LOGIPF, tmp); error = BCOPYOUT(&tmp, data, sizeof(tmp)); if (error != 0) { IPFERROR(106); error = EFAULT; } } else { IPFERROR(107); error = EFAULT; } } break; #ifdef USE_INET6 case SIOCIPFL6 : if (!(mode & FWRITE)) { IPFERROR(108); error = EPERM; } else { error = BCOPYIN(data, &tmp, sizeof(tmp)); if (!error) { tmp = ipf_flush(softc, IPL_LOGIPF, tmp); error = BCOPYOUT(&tmp, data, sizeof(tmp)); if (error != 0) { IPFERROR(109); error = EFAULT; } } else { IPFERROR(110); error = EFAULT; } } break; #endif case SIOCSTLCK : if (!(mode & FWRITE)) { IPFERROR(122); error = EPERM; } else { error = BCOPYIN(data, &tmp, sizeof(tmp)); if (error == 0) { ipf_state_setlock(softc->ipf_state_soft, tmp); ipf_nat_setlock(softc->ipf_nat_soft, tmp); ipf_frag_setlock(softc->ipf_frag_soft, tmp); ipf_auth_setlock(softc->ipf_auth_soft, tmp); } else { IPFERROR(111); error = EFAULT; 
} } break; #ifdef IPFILTER_LOG case SIOCIPFFB : if (!(mode & FWRITE)) { IPFERROR(112); error = EPERM; } else { tmp = ipf_log_clear(softc, IPL_LOGIPF); error = BCOPYOUT(&tmp, data, sizeof(tmp)); if (error) { IPFERROR(113); error = EFAULT; } } break; #endif /* IPFILTER_LOG */ case SIOCFRSYN : if (!(mode & FWRITE)) { IPFERROR(114); error = EPERM; } else { WRITE_ENTER(&softc->ipf_global); #if (defined(MENTAT) && defined(_KERNEL)) && !defined(INSTANCES) error = ipfsync(); #else ipf_sync(softc, NULL); error = 0; #endif RWLOCK_EXIT(&softc->ipf_global); } break; case SIOCGFRST : error = ipf_outobj(softc, (void *)data, ipf_frag_stats(softc->ipf_frag_soft), IPFOBJ_FRAGSTAT); break; #ifdef IPFILTER_LOG case FIONREAD : tmp = ipf_log_bytesused(softc, IPL_LOGIPF); error = BCOPYOUT(&tmp, data, sizeof(tmp)); break; #endif case SIOCIPFITER : SPL_SCHED(s); error = ipf_frruleiter(softc, data, uid, ctx); SPL_X(s); break; case SIOCGENITER : SPL_SCHED(s); error = ipf_genericiter(softc, data, uid, ctx); SPL_X(s); break; case SIOCIPFDELTOK : error = BCOPYIN(data, &tmp, sizeof(tmp)); if (error == 0) { SPL_SCHED(s); error = ipf_token_del(softc, tmp, uid, ctx); SPL_X(s); } break; default : IPFERROR(115); error = EINVAL; break; } return error; } /* ------------------------------------------------------------------------ */ /* Function: ipf_decaps */ /* Returns: u_32_t - -1 == decapsulation failed, else bit mask of */ /* flags indicating packet filtering decision. */ /* Parameters: fin(I) - pointer to packet information */ /* pass(I) - current filtering result flags */ /* l5proto(I) - layer 5 protocol to decode UDP data as. */ /* */ /* This function is called for packets that are wrapped up in other packets, */ /* for example, an IP packet that is the entire data segment for another IP */ /* packet. If the basic constraints for this are satisfied, change the */ /* buffer to point to the start of the inner packet and start processing */ /* rules belonging to the head group this rule specifies. */ /* ------------------------------------------------------------------------ */ u_32_t ipf_decaps(fin, pass, l5proto) fr_info_t *fin; u_32_t pass; int l5proto; { fr_info_t fin2, *fino = NULL; int elen, hlen, nh; grehdr_t gre; ip_t *ip; mb_t *m; if ((fin->fin_flx & FI_COALESCE) == 0) if (ipf_coalesce(fin) == -1) goto cantdecaps; m = fin->fin_m; hlen = fin->fin_hlen; switch (fin->fin_p) { case IPPROTO_UDP : /* * In this case, the specific protocol being decapsulated * inside UDP frames comes from the rule. */ nh = fin->fin_fr->fr_icode; break; case IPPROTO_GRE : /* 47 */ bcopy(fin->fin_dp, (char *)&gre, sizeof(gre)); hlen += sizeof(grehdr_t); if (gre.gr_R|gre.gr_s) goto cantdecaps; if (gre.gr_C) hlen += 4; if (gre.gr_K) hlen += 4; if (gre.gr_S) hlen += 4; nh = IPPROTO_IP; /* * If the routing options flag is set, validate that it is * there and bounce over it. */ #if 0 /* This is really heavy weight and leaves lots of room for error, */ /* so for now, put it off and get the simple stuff right.
*/ if (gre.gr_R) { u_char off, len, *s; u_short af; int end; end = 0; s = fin->fin_dp; s += hlen; aplen = fin->fin_plen - hlen; while (aplen > 3) { af = (s[0] << 8) | s[1]; off = s[2]; len = s[3]; aplen -= 4; s += 4; if (af == 0 && len == 0) { end = 1; break; } if (aplen < len) break; s += len; aplen -= len; } if (end != 1) goto cantdecaps; hlen = s - (u_char *)fin->fin_dp; } #endif break; #ifdef IPPROTO_IPIP case IPPROTO_IPIP : /* 4 */ #endif nh = IPPROTO_IP; break; default : /* Includes ESP, AH is special for IPv4 */ goto cantdecaps; } switch (nh) { case IPPROTO_IP : case IPPROTO_IPV6 : break; default : goto cantdecaps; } bcopy((char *)fin, (char *)&fin2, sizeof(fin2)); fino = fin; fin = &fin2; elen = hlen; #if defined(MENTAT) && defined(_KERNEL) m->b_rptr += elen; #else m->m_data += elen; m->m_len -= elen; #endif fin->fin_plen -= elen; ip = (ip_t *)((char *)fin->fin_ip + elen); /* * Make sure we have at least enough data for the network layer * header. */ if (IP_V(ip) == 4) hlen = IP_HL(ip) << 2; #ifdef USE_INET6 else if (IP_V(ip) == 6) hlen = sizeof(ip6_t); #endif else goto cantdecaps2; if (fin->fin_plen < hlen) goto cantdecaps2; fin->fin_dp = (char *)ip + hlen; if (IP_V(ip) == 4) { /* * Perform IPv4 header checksum validation. */ if (ipf_cksum((u_short *)ip, hlen)) goto cantdecaps2; } if (ipf_makefrip(hlen, ip, fin) == -1) { cantdecaps2: if (m != NULL) { #if defined(MENTAT) && defined(_KERNEL) m->b_rptr -= elen; #else m->m_data -= elen; m->m_len += elen; #endif } cantdecaps: DT1(frb_decapfrip, fr_info_t *, fin); pass &= ~FR_CMDMASK; pass |= FR_BLOCK|FR_QUICK; fin->fin_reason = FRB_DECAPFRIP; return -1; } pass = ipf_scanlist(fin, pass); /* * Copy the packet filter "result" fields out of the fr_info_t struct * that is local to the decapsulation processing and back into the * one we were called with. */ fino->fin_flx = fin->fin_flx; fino->fin_rev = fin->fin_rev; fino->fin_icode = fin->fin_icode; fino->fin_rule = fin->fin_rule; (void) strncpy(fino->fin_group, fin->fin_group, FR_GROUPLEN); fino->fin_fr = fin->fin_fr; fino->fin_error = fin->fin_error; fino->fin_mp = fin->fin_mp; fino->fin_m = fin->fin_m; m = fin->fin_m; if (m != NULL) { #if defined(MENTAT) && defined(_KERNEL) m->b_rptr -= elen; #else m->m_data -= elen; m->m_len += elen; #endif } return pass; } /* ------------------------------------------------------------------------ */ /* Function: ipf_matcharray_load */ /* Returns: int - 0 = success, else error */ /* Parameters: softc(I) - pointer to soft context main structure */ /* data(I) - pointer to ioctl data */ /* objp(I) - ipfobj_t structure to load data into */ /* arrayptr(I) - pointer to location to store array pointer */ /* */ /* This function loads in a matching array through the ipfobj_t struct that */ /* describes it. Sanity checking and array size limitations are enforced */ /* in this function to prevent userspace from trying to load in something */ /* that is insanely big. Once the size of the array is known, the memory */ /* required is malloc'd and returned through changing *arrayptr. The */ /* contents of the array are verified before returning. Only in the event */ /* of a successful call is the caller required to free up the malloc area.
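As a concrete example of the size rules enforced here and in ipf_matcharray_verify below: the smallest legal array is six words long - array[0] itself holding the total word count, one four word operation and the IPF_EXP_END trailer - giving an ipfo_size of 6 and an in-kernel arraysize of 24 bytes.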
*/ /* ------------------------------------------------------------------------ */ int ipf_matcharray_load(softc, data, objp, arrayptr) ipf_main_softc_t *softc; caddr_t data; ipfobj_t *objp; int **arrayptr; { int arraysize, *array, error; *arrayptr = NULL; error = BCOPYIN(data, objp, sizeof(*objp)); if (error != 0) { IPFERROR(116); return EFAULT; } if (objp->ipfo_type != IPFOBJ_IPFEXPR) { IPFERROR(117); return EINVAL; } if (((objp->ipfo_size & 3) != 0) || (objp->ipfo_size == 0) || (objp->ipfo_size > 1024)) { IPFERROR(118); return EINVAL; } arraysize = objp->ipfo_size * sizeof(*array); KMALLOCS(array, int *, arraysize); if (array == NULL) { IPFERROR(119); return ENOMEM; } error = COPYIN(objp->ipfo_ptr, array, arraysize); if (error != 0) { KFREES(array, arraysize); IPFERROR(120); return EFAULT; } if (ipf_matcharray_verify(array, arraysize) != 0) { KFREES(array, arraysize); IPFERROR(121); return EINVAL; } *arrayptr = array; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_matcharray_verify */ /* Returns: int - 0 = valid array, -1 = invalid array */ /* Parameters: array(I) - pointer to matching array */ /* arraysize(I) - number of elements in the array */ /* */ /* Verify the contents of a matching array by stepping through each element */ /* in it. The actual commands in the array are not verified for */ /* correctness, only that all of the sizes are correctly within limits. */ /* ------------------------------------------------------------------------ */ int ipf_matcharray_verify(array, arraysize) int *array, arraysize; { int i, nelem, maxidx; ipfexp_t *e; nelem = arraysize / sizeof(*array); /* * Currently, it makes no sense to have an array less than 6 * elements long - the initial size at the front, a single operation * (minimum 4 in length) and a trailer, for a total of 6. */ if ((array[0] < 6) || (arraysize < 24) || (arraysize > 4096)) { return -1; } /* * Verify the size of data pointed to by array with how long * the array claims to be itself. */ if (array[0] * sizeof(*array) != arraysize) { return -1; } maxidx = nelem - 1; /* * The last opcode in this array should be an IPF_EXP_END. */ if (array[maxidx] != IPF_EXP_END) { return -1; } for (i = 1; i < maxidx; ) { e = (ipfexp_t *)(array + i); /* * The length of the bits to check must be at least 1 * (or else there is nothing to compare with!) and it * cannot exceed the length of the data present. */ if ((e->ipfe_size < 1 ) || (e->ipfe_size + i > maxidx)) { return -1; } i += e->ipfe_size; } return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_fr_matcharray */ /* Returns: int - 0 = match failed, else positive match */ /* Parameters: fin(I) - pointer to packet information */ /* array(I) - pointer to matching array */ /* */ /* This function is used to apply a matching array against a packet and */ /* return an indication of whether or not the packet successfully matches */ /* all of the commands in it. */ /* ------------------------------------------------------------------------ */ static int ipf_fr_matcharray(fin, array) fr_info_t *fin; int *array; { int i, n, *x, rv, p; ipfexp_t *e; rv = 0; n = array[0]; x = array + 1; for (; n > 0; x += 3 + x[3], rv = 0) { e = (ipfexp_t *)x; if (e->ipfe_cmd == IPF_EXP_END) break; n -= e->ipfe_size; /* * The upper 16 bits currently store the protocol value. * This is currently used with TCP and UDP port compares and * allows "tcp.port = 80" without requiring an explicit * "ip.pr = tcp" first.
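For example, an IPF_EXP_TCP_PORT opcode carries IPPROTO_TCP in those upper bits, so when the packet being examined is UDP the loop below breaks out with rv still 0 before any port words are compared.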
*/ p = e->ipfe_cmd >> 16; if ((p != 0) && (p != fin->fin_p)) break; switch (e->ipfe_cmd) { case IPF_EXP_IP_PR : for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= (fin->fin_p == e->ipfe_arg0[i]); } break; case IPF_EXP_IP_SRCADDR : if (fin->fin_v != 4) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= ((fin->fin_saddr & e->ipfe_arg0[i * 2 + 1]) == e->ipfe_arg0[i * 2]); } break; case IPF_EXP_IP_DSTADDR : if (fin->fin_v != 4) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= ((fin->fin_daddr & e->ipfe_arg0[i * 2 + 1]) == e->ipfe_arg0[i * 2]); } break; case IPF_EXP_IP_ADDR : if (fin->fin_v != 4) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= ((fin->fin_saddr & e->ipfe_arg0[i * 2 + 1]) == e->ipfe_arg0[i * 2]) || ((fin->fin_daddr & e->ipfe_arg0[i * 2 + 1]) == e->ipfe_arg0[i * 2]); } break; #ifdef USE_INET6 case IPF_EXP_IP6_SRCADDR : if (fin->fin_v != 6) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= IP6_MASKEQ(&fin->fin_src6, &e->ipfe_arg0[i * 8 + 4], &e->ipfe_arg0[i * 8]); } break; case IPF_EXP_IP6_DSTADDR : if (fin->fin_v != 6) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= IP6_MASKEQ(&fin->fin_dst6, &e->ipfe_arg0[i * 8 + 4], &e->ipfe_arg0[i * 8]); } break; case IPF_EXP_IP6_ADDR : if (fin->fin_v != 6) break; for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= IP6_MASKEQ(&fin->fin_src6, &e->ipfe_arg0[i * 8 + 4], &e->ipfe_arg0[i * 8]) || IP6_MASKEQ(&fin->fin_dst6, &e->ipfe_arg0[i * 8 + 4], &e->ipfe_arg0[i * 8]); } break; #endif case IPF_EXP_UDP_PORT : case IPF_EXP_TCP_PORT : for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= (fin->fin_sport == e->ipfe_arg0[i]) || (fin->fin_dport == e->ipfe_arg0[i]); } break; case IPF_EXP_UDP_SPORT : case IPF_EXP_TCP_SPORT : for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= (fin->fin_sport == e->ipfe_arg0[i]); } break; case IPF_EXP_UDP_DPORT : case IPF_EXP_TCP_DPORT : for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= (fin->fin_dport == e->ipfe_arg0[i]); } break; case IPF_EXP_TCP_FLAGS : for (i = 0; !rv && i < e->ipfe_narg; i++) { rv |= ((fin->fin_tcpf & e->ipfe_arg0[i * 2 + 1]) == e->ipfe_arg0[i * 2]); } break; } rv ^= e->ipfe_not; if (rv == 0) break; } return rv; } /* ------------------------------------------------------------------------ */ /* Function: ipf_queueflush */ /* Returns: int - number of entries flushed (0 = none) */ /* Parameters: softc(I) - pointer to soft context main structure */ /* deletefn(I) - function to call to delete entry */ /* ipfqs(I) - top of the list of ipf internal queues */ /* userqs(I) - top of the list of user defined timeouts */ /* */ /* This function gets called when the state/NAT hash tables fill up and we */ /* need to try a bit harder to free up some space. The algorithm used here */ /* is split into two parts but both halves have the same goal: to reduce the */ /* number of connections considered to be "active" to the low watermark. */ /* There are two steps in doing this: */ /* 1) Remove any TCP connections that are already considered to be "closed" */ /* but have not yet been removed from the state table. The two states */ /* TCPS_TIME_WAIT and TCPS_CLOSED are considered to be the perfect */ /* candidates for this style of removal. If freeing up entries in */ /* CLOSED or both CLOSED and TIME_WAIT brings us to the low watermark, */ /* we do not go on to step 2. */ /* */ /* 2) Look for the oldest entries on each timeout queue and free them if */ /* they are within the given window we are considering.
Where the */ /* window starts and the steps taken to increase its size depend upon */ /* how long ipf has been running (ipf_ticks.) Anything modified in the */ /* last 30 seconds is not touched. */ /* touched */ /* die ipf_ticks 30*1.5 1800*1.5 | 43200*1.5 */ /* | | | | | | */ /* future <--+----------+--------+-----------+-----+-----+-----------> past */ /* now \_int=30s_/ \_int=1hr_/ \_int=12hr */ /* */ /* Points to note: */ /* - tqe_die is the time, in the future, when entries die. */ /* - tqe_die - ipf_ticks is how long left the connection has to live in ipf */ /* ticks. */ /* - tqe_touched is when the entry was last used by NAT/state */ /* - the closer tqe_touched is to ipf_ticks, the further tqe_die will be */ /* from ipf_ticks for any given timeout queue and vice versa. */ /* - both tqe_die and tqe_touched increase over time */ /* - timeout queues are sorted with the highest value of tqe_die at the */ /* bottom and therefore the smallest values of each are at the top */ /* - the pointer passed in as ipfqs should point to an array of timeout */ /* queues representing each of the TCP states */ /* */ /* We start by setting up a maximum range to scan for things to remove, */ /* from iend (newest) to istart (oldest) in chunks of "interval". If nothing is */ /* found in that range, "interval" is adjusted (so long as it isn't 30) and */ /* we start again with a new value for "iend" and "istart". This is */ /* continued until we either finish the scan of 30 second intervals or the */ /* low water mark is reached. */ /* ------------------------------------------------------------------------ */ int ipf_queueflush(softc, deletefn, ipfqs, userqs, activep, size, low) ipf_main_softc_t *softc; ipftq_delete_fn_t deletefn; ipftq_t *ipfqs, *userqs; u_int *activep; int size, low; { u_long interval, istart, iend; ipftq_t *ifq, *ifqnext; ipftqent_t *tqe, *tqn; int removed = 0; for (tqn = ipfqs[IPF_TCPS_CLOSED].ifq_head; ((tqe = tqn) != NULL); ) { tqn = tqe->tqe_next; if ((*deletefn)(softc, tqe->tqe_parent) == 0) removed++; } if ((*activep * 100 / size) > low) { for (tqn = ipfqs[IPF_TCPS_TIME_WAIT].ifq_head; ((tqe = tqn) != NULL); ) { tqn = tqe->tqe_next; if ((*deletefn)(softc, tqe->tqe_parent) == 0) removed++; } } if ((*activep * 100 / size) <= low) { return removed; } /* * NOTE: Use of "* 15 / 10" is required here because if "* 1.5" is * used then the operations are upgraded to floating point * and kernels don't like floating point...
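For example, once ipf_ticks exceeds IPF_TTLVAL(43200 * 15 / 10) - that is, the filter has been running for more than 18 hours - the scan below starts four days back (istart = IPF_TTLVAL(86400 * 4)) and works towards the present in 12 hour (43200 second) chunks.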
*/ if (softc->ipf_ticks > IPF_TTLVAL(43200 * 15 / 10)) { istart = IPF_TTLVAL(86400 * 4); interval = IPF_TTLVAL(43200); } else if (softc->ipf_ticks > IPF_TTLVAL(1800 * 15 / 10)) { istart = IPF_TTLVAL(43200); interval = IPF_TTLVAL(1800); } else if (softc->ipf_ticks > IPF_TTLVAL(30 * 15 / 10)) { istart = IPF_TTLVAL(1800); interval = IPF_TTLVAL(30); } else { return 0; } if (istart > softc->ipf_ticks) { if (softc->ipf_ticks - interval < interval) istart = interval; else istart = (softc->ipf_ticks / interval) * interval; } iend = softc->ipf_ticks - interval; while ((*activep * 100 / size) > low) { u_long try; try = softc->ipf_ticks - istart; for (ifq = ipfqs; ifq != NULL; ifq = ifq->ifq_next) { for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) { if (try < tqe->tqe_touched) break; tqn = tqe->tqe_next; if ((*deletefn)(softc, tqe->tqe_parent) == 0) removed++; } } for (ifq = userqs; ifq != NULL; ifq = ifqnext) { ifqnext = ifq->ifq_next; for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) { if (try < tqe->tqe_touched) break; tqn = tqe->tqe_next; if ((*deletefn)(softc, tqe->tqe_parent) == 0) removed++; } } if (try >= iend) { if (interval == IPF_TTLVAL(43200)) { interval = IPF_TTLVAL(1800); } else if (interval == IPF_TTLVAL(1800)) { interval = IPF_TTLVAL(30); } else { break; } if (interval >= softc->ipf_ticks) break; iend = softc->ipf_ticks - interval; } istart -= interval; } return removed; } /* ------------------------------------------------------------------------ */ /* Function: ipf_deliverlocal */ /* Returns: int - 1 = local address, 0 = non-local address */ /* Parameters: softc(I) - pointer to soft context main structure */ /* ipversion(I) - IP protocol version (4 or 6) */ /* ifp(I) - network interface pointer */ /* ipaddr(I) - IPv4/6 destination address */ /* */ /* This function is used to determine if the address "ipaddr" belongs to */ /* the network interface represented by ifp. */ /* ------------------------------------------------------------------------ */ int ipf_deliverlocal(softc, ipversion, ifp, ipaddr) ipf_main_softc_t *softc; int ipversion; void *ifp; i6addr_t *ipaddr; { i6addr_t addr; int islocal = 0; if (ipversion == 4) { if (ipf_ifpaddr(softc, 4, FRI_NORMAL, ifp, &addr, NULL) == 0) { if (addr.in4.s_addr == ipaddr->in4.s_addr) islocal = 1; } #ifdef USE_INET6 } else if (ipversion == 6) { if (ipf_ifpaddr(softc, 6, FRI_NORMAL, ifp, &addr, NULL) == 0) { if (IP6_EQ(&addr, ipaddr)) islocal = 1; } #endif } return islocal; } /* ------------------------------------------------------------------------ */ /* Function: ipf_settimeout */ /* Returns: int - 0 = success, -1 = failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* t(I) - pointer to tuneable array entry */ /* p(I) - pointer to values passed in to apply */ /* */ /* This function is called to set the timeout values for each distinct */ /* queue timeout that is available. When called, it calls into both the */ /* state and NAT code, telling them to update their timeout queues. */ /* ------------------------------------------------------------------------ */ static int ipf_settimeout(softc, t, p) struct ipf_main_softc_s *softc; ipftuneable_t *t; ipftuneval_t *p; { /* * ipf_interror should be set by the functions called here, not * by this function - it's just a middle man.
*/ if (ipf_state_settimeout(softc, t, p) == -1) return -1; if (ipf_nat_settimeout(softc, t, p) == -1) return -1; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_apply_timeout */ /* Returns: Nil */ /* Parameters: head(I) - pointer to timeout queue to update */ /* seconds(I) - new timeout value to apply, in seconds */ /* */ /* This function applies a timeout of "seconds" to the timeout queue that */ /* is pointed to by "head". All entries on this list have an expiration */ /* set to be the current tick value of ipf plus the ttl. Given that this */ /* function should only be called when the delta is non-zero, the task is */ /* to walk the entire list and apply the change. The sort order will not */ /* change. The only catch is that this is O(n) across the list, so if the */ /* queue has lots of entries (10s of thousands or 100s of thousands), it */ /* could take a relatively long time to work through them all. */ /* ------------------------------------------------------------------------ */ void ipf_apply_timeout(head, seconds) ipftq_t *head; u_int seconds; { u_int oldtimeout, newtimeout; ipftqent_t *tqe; int delta; MUTEX_ENTER(&head->ifq_lock); oldtimeout = head->ifq_ttl; newtimeout = IPF_TTLVAL(seconds); delta = oldtimeout - newtimeout; head->ifq_ttl = newtimeout; for (tqe = head->ifq_head; tqe != NULL; tqe = tqe->tqe_next) { tqe->tqe_die -= delta; } MUTEX_EXIT(&head->ifq_lock); } /* ------------------------------------------------------------------------ */ /* Function: ipf_settimeout_tcp */ /* Returns: int - 0 = successfully applied, -1 = failed */ /* Parameters: t(I) - pointer to tuneable to change */ /* p(I) - pointer to new timeout information */ /* tab(I) - pointer to table of TCP queues */ /* */ /* This function applies the new timeout (p) to the TCP tunable (t) and */ /* updates all of the entries on the relevant timeout queue by calling */ /* ipf_apply_timeout().
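For example (editorial note, hypothetical values): setting "tcp_idle_timeout" to 7200 updates the ESTABLISHED queue's ifq_ttl and re-stamps the tqe_die of every entry already sitting on that queue, preserving their relative order.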
*/ /* ------------------------------------------------------------------------ */ int ipf_settimeout_tcp(t, p, tab) ipftuneable_t *t; ipftuneval_t *p; ipftq_t *tab; { if (!strcmp(t->ipft_name, "tcp_idle_timeout") || !strcmp(t->ipft_name, "tcp_established")) { ipf_apply_timeout(&tab[IPF_TCPS_ESTABLISHED], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_close_wait")) { ipf_apply_timeout(&tab[IPF_TCPS_CLOSE_WAIT], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_last_ack")) { ipf_apply_timeout(&tab[IPF_TCPS_LAST_ACK], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_timeout")) { ipf_apply_timeout(&tab[IPF_TCPS_LISTEN], p->ipftu_int); ipf_apply_timeout(&tab[IPF_TCPS_HALF_ESTAB], p->ipftu_int); ipf_apply_timeout(&tab[IPF_TCPS_CLOSING], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_listen")) { ipf_apply_timeout(&tab[IPF_TCPS_LISTEN], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_half_established")) { ipf_apply_timeout(&tab[IPF_TCPS_HALF_ESTAB], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_closing")) { ipf_apply_timeout(&tab[IPF_TCPS_CLOSING], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_syn_received")) { ipf_apply_timeout(&tab[IPF_TCPS_SYN_RECEIVED], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_syn_sent")) { ipf_apply_timeout(&tab[IPF_TCPS_SYN_SENT], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_closed")) { ipf_apply_timeout(&tab[IPF_TCPS_CLOSED], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_half_closed")) { ipf_apply_timeout(&tab[IPF_TCPS_CLOSED], p->ipftu_int); } else if (!strcmp(t->ipft_name, "tcp_time_wait")) { ipf_apply_timeout(&tab[IPF_TCPS_TIME_WAIT], p->ipftu_int); } else { /* * ipf_interror isn't set here because it should be set * by whatever called this function. */ return -1; } return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_soft_create */ /* Returns: NULL = failure, else success */ /* Parameters: arg(I) - pointer to soft context structure if already allocd */ /* */ /* Create the foundation soft context structure. In circumstances where it */ /* is not required to dynamically allocate the context, a pointer can be */ /* passed in (rather than NULL) to a structure to be initialised. */ /* The main thing of interest is that a number of locks are initialised */ /* here instead of where they might be expected - in the relevant create */ /* function elsewhere. This is done because the current locking design has */ /* some areas where these locks are used outside of their module. */ /* Possibly the most important task done here is the setting of all */ /* the timeout values, allowing them to be changed before init(). */ /* ------------------------------------------------------------------------ */ void * ipf_main_soft_create(arg) void *arg; { ipf_main_softc_t *softc; if (arg == NULL) { KMALLOC(softc, ipf_main_softc_t *); if (softc == NULL) return NULL; } else { softc = arg; } bzero((char *)softc, sizeof(*softc)); /* * This serves as a flag as to whether or not the softc should be * free'd when _destroy is called. */ softc->ipf_dynamic_softc = (arg == NULL) ?
1 : 0; softc->ipf_tuners = ipf_tune_array_copy(softc, sizeof(ipf_main_tuneables), ipf_main_tuneables); if (softc->ipf_tuners == NULL) { ipf_main_soft_destroy(softc); return NULL; } MUTEX_INIT(&softc->ipf_rw, "ipf rw mutex"); MUTEX_INIT(&softc->ipf_timeoutlock, "ipf timeout lock"); RWLOCK_INIT(&softc->ipf_global, "ipf filter load/unload mutex"); RWLOCK_INIT(&softc->ipf_mutex, "ipf filter rwlock"); RWLOCK_INIT(&softc->ipf_tokens, "ipf token rwlock"); RWLOCK_INIT(&softc->ipf_state, "ipf state rwlock"); RWLOCK_INIT(&softc->ipf_nat, "ipf IP NAT rwlock"); RWLOCK_INIT(&softc->ipf_poolrw, "ipf pool rwlock"); RWLOCK_INIT(&softc->ipf_frag, "ipf frag rwlock"); softc->ipf_token_head = NULL; softc->ipf_token_tail = &softc->ipf_token_head; softc->ipf_tcpidletimeout = FIVE_DAYS; softc->ipf_tcpclosewait = IPF_TTLVAL(2 * TCP_MSL); softc->ipf_tcplastack = IPF_TTLVAL(30); softc->ipf_tcptimewait = IPF_TTLVAL(2 * TCP_MSL); softc->ipf_tcptimeout = IPF_TTLVAL(2 * TCP_MSL); softc->ipf_tcpsynsent = IPF_TTLVAL(2 * TCP_MSL); softc->ipf_tcpsynrecv = IPF_TTLVAL(2 * TCP_MSL); softc->ipf_tcpclosed = IPF_TTLVAL(30); softc->ipf_tcphalfclosed = IPF_TTLVAL(2 * 3600); softc->ipf_udptimeout = IPF_TTLVAL(120); softc->ipf_udpacktimeout = IPF_TTLVAL(12); softc->ipf_icmptimeout = IPF_TTLVAL(60); softc->ipf_icmpacktimeout = IPF_TTLVAL(6); softc->ipf_iptimeout = IPF_TTLVAL(60); #if defined(IPFILTER_DEFAULT_BLOCK) softc->ipf_pass = FR_BLOCK|FR_NOMATCH; #else softc->ipf_pass = (IPF_DEFAULT_PASS)|FR_NOMATCH; #endif softc->ipf_minttl = 4; softc->ipf_icmpminfragmtu = 68; softc->ipf_flags = IPF_LOGGING; return softc; } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_soft_init */ /* Returns: 0 = success, -1 = failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* A null-op function that exists as a placeholder so that the flow in */ /* other functions is obvious. */ /* ------------------------------------------------------------------------ */ /*ARGSUSED*/ int ipf_main_soft_init(softc) ipf_main_softc_t *softc; { return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_soft_destroy */ /* Returns: void */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Undo everything that we did in ipf_main_soft_create. */ /* */ /* The most important check that needs to be made here is whether or not */ /* the structure was allocated by ipf_main_soft_create() by checking what */ /* value is stored in ipf_dynamic_softc. */ /* ------------------------------------------------------------------------ */ /*ARGSUSED*/ void ipf_main_soft_destroy(softc) ipf_main_softc_t *softc; { RW_DESTROY(&softc->ipf_frag); RW_DESTROY(&softc->ipf_poolrw); RW_DESTROY(&softc->ipf_nat); RW_DESTROY(&softc->ipf_state); RW_DESTROY(&softc->ipf_tokens); RW_DESTROY(&softc->ipf_mutex); RW_DESTROY(&softc->ipf_global); MUTEX_DESTROY(&softc->ipf_timeoutlock); MUTEX_DESTROY(&softc->ipf_rw); if (softc->ipf_tuners != NULL) { KFREES(softc->ipf_tuners, sizeof(ipf_main_tuneables)); } if (softc->ipf_dynamic_softc == 1) { KFREE(softc); } } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_soft_fini */ /* Returns: 0 = success, -1 = failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Clean out the rules which have been added since _init was last called, */ /* the only dynamic part of the mainline.
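(Editorial note: both the active and inactive (FR_INACTIVE) copies of the ipf and count rule lists are flushed below, so no rule loaded since then survives.)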
*/ /* ------------------------------------------------------------------------ */ int ipf_main_soft_fini(softc) ipf_main_softc_t *softc; { (void) ipf_flush(softc, IPL_LOGIPF, FR_INQUE|FR_OUTQUE|FR_INACTIVE); (void) ipf_flush(softc, IPL_LOGIPF, FR_INQUE|FR_OUTQUE); (void) ipf_flush(softc, IPL_LOGCOUNT, FR_INQUE|FR_OUTQUE|FR_INACTIVE); (void) ipf_flush(softc, IPL_LOGCOUNT, FR_INQUE|FR_OUTQUE); return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_load */ /* Returns: 0 = success, -1 = failure */ /* Parameters: none */ /* */ /* Handle global initialisation that needs to be done for the base part of */ /* IPFilter. At present this just amounts to initialising some ICMP lookup */ /* arrays that get used by the state/NAT code. */ /* ------------------------------------------------------------------------ */ int ipf_main_load() { int i; /* fill icmp reply type table */ for (i = 0; i <= ICMP_MAXTYPE; i++) icmpreplytype4[i] = -1; icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY; icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY; icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY; icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY; #ifdef USE_INET6 /* fill icmp reply type table */ for (i = 0; i <= ICMP6_MAXTYPE; i++) icmpreplytype6[i] = -1; icmpreplytype6[ICMP6_ECHO_REQUEST] = ICMP6_ECHO_REPLY; icmpreplytype6[ICMP6_MEMBERSHIP_QUERY] = ICMP6_MEMBERSHIP_REPORT; icmpreplytype6[ICMP6_NI_QUERY] = ICMP6_NI_REPLY; icmpreplytype6[ND_ROUTER_SOLICIT] = ND_ROUTER_ADVERT; icmpreplytype6[ND_NEIGHBOR_SOLICIT] = ND_NEIGHBOR_ADVERT; #endif return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_main_unload */ /* Returns: 0 = success, -1 = failure */ /* Parameters: none */ /* */ /* A null-op function that exists as a placeholder so that the flow in */ /* other functions is obvious. 
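(Editorial note: the icmpreplytype4/icmpreplytype6 tables filled in by ipf_main_load() above are plain static arrays - e.g. icmpreplytype4[ICMP_ECHO] == ICMP_ECHOREPLY pairs an echo request with its reply - so there is nothing for this function to undo.)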
*/ /* ------------------------------------------------------------------------ */ int ipf_main_unload() { return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_load_all */ /* Returns: 0 = success, -1 = failure */ /* Parameters: none */ /* */ /* Work through all of the subsystems inside IPFilter and call the load */ /* function for each in an order that won't lead to a crash :) */ /* ------------------------------------------------------------------------ */ int ipf_load_all() { if (ipf_main_load() == -1) return -1; if (ipf_state_main_load() == -1) return -1; if (ipf_nat_main_load() == -1) return -1; if (ipf_frag_main_load() == -1) return -1; if (ipf_auth_main_load() == -1) return -1; if (ipf_proxy_main_load() == -1) return -1; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_unload_all */ /* Returns: 0 = success, -1 = failure */ /* Parameters: none */ /* */ /* Work through all of the subsystems inside IPFilter and call the unload */ /* function for each in an order that won't lead to a crash :) */ /* ------------------------------------------------------------------------ */ int ipf_unload_all() { if (ipf_proxy_main_unload() == -1) return -1; if (ipf_auth_main_unload() == -1) return -1; if (ipf_frag_main_unload() == -1) return -1; if (ipf_nat_main_unload() == -1) return -1; if (ipf_state_main_unload() == -1) return -1; if (ipf_main_unload() == -1) return -1; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_create_all */ /* Returns: NULL = failure, else success */ /* Parameters: arg(I) - pointer to soft context main structure */ /* */ /* Work through all of the subsystems inside IPFilter and call the create */ /* function for each in an order that won't lead to a crash :) */ /* ------------------------------------------------------------------------ */ ipf_main_softc_t * ipf_create_all(arg) void *arg; { ipf_main_softc_t *softc; softc = ipf_main_soft_create(arg); if (softc == NULL) return NULL; #ifdef IPFILTER_LOG softc->ipf_log_soft = ipf_log_soft_create(softc); if (softc->ipf_log_soft == NULL) { ipf_destroy_all(softc); return NULL; } #endif softc->ipf_lookup_soft = ipf_lookup_soft_create(softc); if (softc->ipf_lookup_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_sync_soft = ipf_sync_soft_create(softc); if (softc->ipf_sync_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_state_soft = ipf_state_soft_create(softc); if (softc->ipf_state_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_nat_soft = ipf_nat_soft_create(softc); if (softc->ipf_nat_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_frag_soft = ipf_frag_soft_create(softc); if (softc->ipf_frag_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_auth_soft = ipf_auth_soft_create(softc); if (softc->ipf_auth_soft == NULL) { ipf_destroy_all(softc); return NULL; } softc->ipf_proxy_soft = ipf_proxy_soft_create(softc); if (softc->ipf_proxy_soft == NULL) { ipf_destroy_all(softc); return NULL; } return softc; } /* ------------------------------------------------------------------------ */ /* Function: ipf_destroy_all */ /* Returns: void */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Work through all of the subsystems inside IPFilter and call the destroy */ /* function for each in an order that won't lead to a crash :) */ /* */ /* Every one of these functions is 
expected to succeed, so there is no */ /* checking of return values. */ /* ------------------------------------------------------------------------ */ void ipf_destroy_all(softc) ipf_main_softc_t *softc; { if (softc->ipf_state_soft != NULL) { ipf_state_soft_destroy(softc, softc->ipf_state_soft); softc->ipf_state_soft = NULL; } if (softc->ipf_nat_soft != NULL) { ipf_nat_soft_destroy(softc, softc->ipf_nat_soft); softc->ipf_nat_soft = NULL; } if (softc->ipf_frag_soft != NULL) { ipf_frag_soft_destroy(softc, softc->ipf_frag_soft); softc->ipf_frag_soft = NULL; } if (softc->ipf_auth_soft != NULL) { ipf_auth_soft_destroy(softc, softc->ipf_auth_soft); softc->ipf_auth_soft = NULL; } if (softc->ipf_proxy_soft != NULL) { ipf_proxy_soft_destroy(softc, softc->ipf_proxy_soft); softc->ipf_proxy_soft = NULL; } if (softc->ipf_sync_soft != NULL) { ipf_sync_soft_destroy(softc, softc->ipf_sync_soft); softc->ipf_sync_soft = NULL; } if (softc->ipf_lookup_soft != NULL) { ipf_lookup_soft_destroy(softc, softc->ipf_lookup_soft); softc->ipf_lookup_soft = NULL; } #ifdef IPFILTER_LOG if (softc->ipf_log_soft != NULL) { ipf_log_soft_destroy(softc, softc->ipf_log_soft); softc->ipf_log_soft = NULL; } #endif ipf_main_soft_destroy(softc); } /* ------------------------------------------------------------------------ */ /* Function: ipf_init_all */ /* Returns: 0 = success, -1 = failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Work through all of the subsystems inside IPFilter and call the init */ /* function for each in an order that won't lead to a crash :) */ /* ------------------------------------------------------------------------ */ int ipf_init_all(softc) ipf_main_softc_t *softc; { if (ipf_main_soft_init(softc) == -1) return -1; #ifdef IPFILTER_LOG if (ipf_log_soft_init(softc, softc->ipf_log_soft) == -1) return -1; #endif if (ipf_lookup_soft_init(softc, softc->ipf_lookup_soft) == -1) return -1; if (ipf_sync_soft_init(softc, softc->ipf_sync_soft) == -1) return -1; if (ipf_state_soft_init(softc, softc->ipf_state_soft) == -1) return -1; if (ipf_nat_soft_init(softc, softc->ipf_nat_soft) == -1) return -1; if (ipf_frag_soft_init(softc, softc->ipf_frag_soft) == -1) return -1; if (ipf_auth_soft_init(softc, softc->ipf_auth_soft) == -1) return -1; if (ipf_proxy_soft_init(softc, softc->ipf_proxy_soft) == -1) return -1; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_fini_all */ /* Returns: 0 = success, -1 = failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* Work through all of the subsystems inside IPFilter and call the fini */ /* function for each in an order that won't lead to a crash :) */ /* ------------------------------------------------------------------------ */ int ipf_fini_all(softc) ipf_main_softc_t *softc; { ipf_token_flush(softc); if (ipf_proxy_soft_fini(softc, softc->ipf_proxy_soft) == -1) return -1; if (ipf_auth_soft_fini(softc, softc->ipf_auth_soft) == -1) return -1; if (ipf_frag_soft_fini(softc, softc->ipf_frag_soft) == -1) return -1; if (ipf_nat_soft_fini(softc, softc->ipf_nat_soft) == -1) return -1; if (ipf_state_soft_fini(softc, softc->ipf_state_soft) == -1) return -1; if (ipf_sync_soft_fini(softc, softc->ipf_sync_soft) == -1) return -1; if (ipf_lookup_soft_fini(softc, softc->ipf_lookup_soft) == -1) return -1; #ifdef IPFILTER_LOG if (ipf_log_soft_fini(softc, softc->ipf_log_soft) == -1) return -1; #endif if (ipf_main_soft_fini(softc) == -1) return -1; return 0; } /* 
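------------------------------------------------------------------------ */
/* EXAMPLE (editorial, not part of the original source): a minimal sketch  */
/* of the soft context lifecycle built from the functions above. The       */
/* IPF_EXAMPLE_ONLY guard is hypothetical and never defined by any build.  */
/* ------------------------------------------------------------------------ */
#ifdef IPF_EXAMPLE_ONLY
static int
ipf_example_lifecycle(void)
{
	ipf_main_softc_t *softc;

	/* Allocate the soft context and create every subsystem. */
	softc = ipf_create_all(NULL);
	if (softc == NULL)
		return -1;
	/* Initialise the subsystems; tear everything down on failure. */
	if (ipf_init_all(softc) == -1) {
		ipf_destroy_all(softc);
		return -1;
	}
	/* ... filter packets, calling ipf_slowtimer(softc) twice a second ... */
	/* Undo _init, then undo _create. */
	(void) ipf_fini_all(softc);
	ipf_destroy_all(softc);
	return 0;
}
#endif /* IPF_EXAMPLE_ONLY */
/*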
------------------------------------------------------------------------ */ /* Function: ipf_rule_expire */ /* Returns: Nil */ /* Parameters: softc(I) - pointer to soft context main structure */ /* */ /* At present this function exists just to support temporary addition of */ /* firewall rules. Both inactive and active lists are scanned for items to */ /* purge, as by rights, the expiration is computed as soon as the rule is */ /* loaded in. */ /* ------------------------------------------------------------------------ */ void ipf_rule_expire(softc) ipf_main_softc_t *softc; { frentry_t *fr; if ((softc->ipf_rule_explist[0] == NULL) && (softc->ipf_rule_explist[1] == NULL)) return; WRITE_ENTER(&softc->ipf_mutex); while ((fr = softc->ipf_rule_explist[0]) != NULL) { /* * Because the list is kept sorted on insertion, the first * one that dies in the future means no more work to do. */ if (fr->fr_die > softc->ipf_ticks) break; ipf_rule_delete(softc, fr, IPL_LOGIPF, 0); } while ((fr = softc->ipf_rule_explist[1]) != NULL) { /* * Because the list is kept sorted on insertion, the first * one that dies in the future means no more work to do. */ if (fr->fr_die > softc->ipf_ticks) break; ipf_rule_delete(softc, fr, IPL_LOGIPF, 1); } RWLOCK_EXIT(&softc->ipf_mutex); } static int ipf_ht_node_cmp __P((struct host_node_s *, struct host_node_s *)); static void ipf_ht_node_make_key __P((host_track_t *, host_node_t *, int, i6addr_t *)); host_node_t RBI_ZERO(ipf_rb); RBI_CODE(ipf_rb, host_node_t, hn_entry, ipf_ht_node_cmp) /* ------------------------------------------------------------------------ */ /* Function: ipf_ht_node_cmp */ /* Returns: int - 0 == nodes are the same, .. */ /* Parameters: k1(I) - pointer to first key to compare */ /* k2(I) - pointer to second key to compare */ /* */ /* The "key" for the node is a combination of two fields: the address */ /* family and the address itself. */ /* */ /* Because we're not actually interpreting the address data, it isn't */ /* necessary to convert them to/from network/host byte order. The mask is */ /* just used to remove bits that aren't significant - it doesn't matter */ /* where they are, as long as they're always in the same place. */ /* */ /* As with IP6_EQ, comparing IPv6 addresses starts at the bottom because */ /* this is where individual ones will differ the most - but not true */ /* for /48's, etc.
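For example (editorial): two hosts in the same /64 share i6[0] and i6[1] and typically differ in i6[3], so the bottom-up comparison decides most lookups on the first word tested.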
*/ /* ------------------------------------------------------------------------ */ static int ipf_ht_node_cmp(k1, k2) struct host_node_s *k1, *k2; { int i; i = (k2->hn_addr.adf_family - k1->hn_addr.adf_family); if (i != 0) return i; if (k1->hn_addr.adf_family == AF_INET) return (k2->hn_addr.adf_addr.in4.s_addr - k1->hn_addr.adf_addr.in4.s_addr); i = k2->hn_addr.adf_addr.i6[3] - k1->hn_addr.adf_addr.i6[3]; if (i != 0) return i; i = k2->hn_addr.adf_addr.i6[2] - k1->hn_addr.adf_addr.i6[2]; if (i != 0) return i; i = k2->hn_addr.adf_addr.i6[1] - k1->hn_addr.adf_addr.i6[1]; if (i != 0) return i; i = k2->hn_addr.adf_addr.i6[0] - k1->hn_addr.adf_addr.i6[0]; return i; } /* ------------------------------------------------------------------------ */ /* Function: ipf_ht_node_make_key */ /* Returns: Nil */ /* parameters: htp(I) - pointer to address tracking structure */ /* key(I) - where to store masked address for lookup */ /* family(I) - protocol family of address */ /* addr(I) - pointer to network address */ /* */ /* Using the "netmask" (number of bits) stored in the parent host tracking */ /* struct, copy the address passed in into the key structure whilst masking */ /* out the bits that we don't want. */ /* */ /* Because the parser will set ht_netmask to 128 if there is no protocol */ /* specified (the parser doesn't know if it should be a v4 or v6 rule), we */ /* have to be wary of that and not allow 32-128 to happen. */ /* ------------------------------------------------------------------------ */ static void ipf_ht_node_make_key(htp, key, family, addr) host_track_t *htp; host_node_t *key; int family; i6addr_t *addr; { key->hn_addr.adf_family = family; if (family == AF_INET) { u_32_t mask; int bits; key->hn_addr.adf_len = sizeof(key->hn_addr.adf_addr.in4); bits = htp->ht_netmask; if (bits >= 32) { mask = 0xffffffff; } else { mask = htonl(0xffffffff << (32 - bits)); } key->hn_addr.adf_addr.in4.s_addr = addr->in4.s_addr & mask; #ifdef USE_INET6 } else { int bits = htp->ht_netmask; key->hn_addr.adf_len = sizeof(key->hn_addr.adf_addr.in6); if (bits > 96) { key->hn_addr.adf_addr.i6[3] = addr->i6[3] & htonl(0xffffffff << (128 - bits)); key->hn_addr.adf_addr.i6[2] = addr->i6[2]; key->hn_addr.adf_addr.i6[1] = addr->i6[1]; key->hn_addr.adf_addr.i6[0] = addr->i6[0]; } else if (bits > 64) { key->hn_addr.adf_addr.i6[3] = 0; key->hn_addr.adf_addr.i6[2] = addr->i6[2] & htonl(0xffffffff << (96 - bits)); key->hn_addr.adf_addr.i6[1] = addr->i6[1]; key->hn_addr.adf_addr.i6[0] = addr->i6[0]; } else if (bits > 32) { key->hn_addr.adf_addr.i6[3] = 0; key->hn_addr.adf_addr.i6[2] = 0; key->hn_addr.adf_addr.i6[1] = addr->i6[1] & htonl(0xffffffff << (64 - bits)); key->hn_addr.adf_addr.i6[0] = addr->i6[0]; } else { key->hn_addr.adf_addr.i6[3] = 0; key->hn_addr.adf_addr.i6[2] = 0; key->hn_addr.adf_addr.i6[1] = 0; key->hn_addr.adf_addr.i6[0] = addr->i6[0] & htonl(0xffffffff << (32 - bits)); } #endif } } /* ------------------------------------------------------------------------ */ /* Function: ipf_ht_node_add */ /* Returns: int - 0 == success, -1 == failure */ /* Parameters: softc(I) - pointer to soft context main structure */ /* htp(I) - pointer to address tracking structure */ /* family(I) - protocol family of address */ /* addr(I) - pointer to network address */ /* */ /* NOTE: THIS FUNCTION MUST BE CALLED WITH AN EXCLUSIVE LOCK THAT PREVENTS */ /* ipf_ht_node_del FROM RUNNING CONCURRENTLY ON THE SAME htp.
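For example (editorial, hypothetical addresses): with ht_netmask = 24, both 192.0.2.7 and 192.0.2.200 are masked by ipf_ht_node_make_key() to the key 192.0.2.0 and so share, and count against, the same node.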
*/ /* */ /* After preparing the key with the address information to find, look in */ /* the red-black tree to see if the address is known. A successful call to */ /* this function can mean one of two things: a new node was added to the */ /* tree or a matching node exists and we're able to bump up its activity. */ /* ------------------------------------------------------------------------ */ int ipf_ht_node_add(softc, htp, family, addr) ipf_main_softc_t *softc; host_track_t *htp; int family; i6addr_t *addr; { host_node_t *h; host_node_t k; ipf_ht_node_make_key(htp, &k, family, addr); h = RBI_SEARCH(ipf_rb, &htp->ht_root, &k); if (h == NULL) { if (htp->ht_cur_nodes >= htp->ht_max_nodes) return -1; KMALLOC(h, host_node_t *); if (h == NULL) { DT(ipf_rb_no_mem); LBUMP(ipf_rb_no_mem); return -1; } /* * If there was a macro to initialise the RB node then that * would get used here, but there isn't... */ bzero((char *)h, sizeof(*h)); h->hn_addr = k.hn_addr; h->hn_addr.adf_family = k.hn_addr.adf_family; RBI_INSERT(ipf_rb, &htp->ht_root, h); htp->ht_cur_nodes++; } else { if ((htp->ht_max_per_node != 0) && (h->hn_active >= htp->ht_max_per_node)) { DT(ipf_rb_node_max); LBUMP(ipf_rb_node_max); return -1; } } h->hn_active++; return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_ht_node_del */ /* Returns: int - 0 == success, -1 == failure */ /* parameters: htp(I) - pointer to address tracking structure */ /* family(I) - protocol family of address */ /* addr(I) - pointer to network address */ /* */ /* NOTE: THIS FUNCTION MUST BE CALLED WITH AN EXCLUSIVE LOCK THAT PREVENTS */ /* ipf_ht_node_add FROM RUNNING CONCURRENTLY ON THE SAME htp. */ /* */ /* Try and find the address passed in amongst the leaves on this tree to */ /* be freed. If found then the active count for that node drops by */ /* one. If that count reaches 0, it is time to free it all up. */ /* ------------------------------------------------------------------------ */ int ipf_ht_node_del(htp, family, addr) host_track_t *htp; int family; i6addr_t *addr; { host_node_t *h; host_node_t k; ipf_ht_node_make_key(htp, &k, family, addr); h = RBI_SEARCH(ipf_rb, &htp->ht_root, &k); if (h == NULL) { return -1; } else { h->hn_active--; if (h->hn_active == 0) { (void) RBI_DELETE(ipf_rb, &htp->ht_root, h); htp->ht_cur_nodes--; KFREE(h); } } return 0; } /* ------------------------------------------------------------------------ */ /* Function: ipf_rb_ht_init */ /* Returns: Nil */ /* Parameters: head(I) - pointer to host tracking structure */ /* */ /* Initialise the host tracking structure to be ready for use above. */ /* ------------------------------------------------------------------------ */ void ipf_rb_ht_init(head) host_track_t *head; { RBI_INIT(ipf_rb, &head->ht_root); } /* ------------------------------------------------------------------------ */ /* Function: ipf_rb_ht_freenode */ /* Returns: Nil */ /* Parameters: node(I) - pointer to host node being freed */ /* arg(I) - additional argument from walk caller */ /* */ /* Free an actual host_node_t structure.
*/ /* ------------------------------------------------------------------------ */ void ipf_rb_ht_freenode(node, arg) host_node_t *node; void *arg; { KFREE(node); } /* ------------------------------------------------------------------------ */ /* Function: ipf_rb_ht_flush */ /* Returns: Nil */ /* Parameters: head(I) - pointer to host tracking structure */ /* */ /* Remove all of the nodes in the tree tracking hosts by calling a walker */ /* and free'ing each one. */ /* ------------------------------------------------------------------------ */ void ipf_rb_ht_flush(head) host_track_t *head; { RBI_WALK(ipf_rb, &head->ht_root, ipf_rb_ht_freenode, NULL); } /* ------------------------------------------------------------------------ */ /* Function: ipf_slowtimer */ /* Returns: Nil */ /* Parameters: softc(I) - pointer to main ipf soft context structure */ /* */ /* Slowly expire held state for fragments. Timeouts are set in */ /* expectation of this being called twice per second. */ /* ------------------------------------------------------------------------ */ void ipf_slowtimer(softc) ipf_main_softc_t *softc; { ipf_token_expire(softc); ipf_frag_expire(softc); ipf_state_expire(softc); ipf_nat_expire(softc); ipf_auth_expire(softc); ipf_lookup_expire(softc); ipf_rule_expire(softc); ipf_sync_expire(softc); softc->ipf_ticks++; # if defined(__OpenBSD__) timeout_add(&ipf_slowtimer_ch, hz/2); # endif } /* ------------------------------------------------------------------------ */ /* Function: ipf_inet_mask_add */ /* Returns: Nil */ /* Parameters: bits(I) - number of bits set in the netmask */ /* mtab(I) - pointer to mask hash table structure */ /* */ /* When called, bits represents the mask of a new NAT rule that has just */ /* been added. This function inserts a bitmask into the array of masks to */ /* search when searching for a matching NAT rule for a packet. */ /* Prevention of duplicate masks is achieved by checking the use count for */ /* a given netmask. */ /* ------------------------------------------------------------------------ */ void ipf_inet_mask_add(bits, mtab) int bits; ipf_v4_masktab_t *mtab; { u_32_t mask; int i, j; mtab->imt4_masks[bits]++; if (mtab->imt4_masks[bits] > 1) return; if (bits == 0) mask = 0; else mask = 0xffffffff << (32 - bits); for (i = 0; i < 33; i++) { if (ntohl(mtab->imt4_active[i]) < mask) { for (j = 32; j > i; j--) mtab->imt4_active[j] = mtab->imt4_active[j - 1]; mtab->imt4_active[i] = htonl(mask); break; } } mtab->imt4_max++; } /* ------------------------------------------------------------------------ */ /* Function: ipf_inet_mask_del */ /* Returns: Nil */ /* Parameters: bits(I) - number of bits set in the netmask */ /* mtab(I) - pointer to mask hash table structure */ /* */ /* Remove the 32bit bitmask represented by "bits" from the collection of */ /* netmasks stored inside of mtab.
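For example (editorial): with /24 and /16 rules loaded, imt4_active[] holds 0xffffff00 ahead of 0xffff0000 (descending order), so a lookup tries the more specific mask first; deleting the last /24 rule shifts the remaining masks back up the array.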
*/ /* ------------------------------------------------------------------------ */ void ipf_inet_mask_del(bits, mtab) int bits; ipf_v4_masktab_t *mtab; { u_32_t mask; int i, j; mtab->imt4_masks[bits]--; if (mtab->imt4_masks[bits] > 0) return; if (bits == 0) mask = 0; else mask = htonl(0xffffffff << (32 - bits)); for (i = 0; i < 33; i++) { if (mtab->imt4_active[i] == mask) { for (j = i + 1; j < 33; j++) mtab->imt4_active[j - 1] = mtab->imt4_active[j]; break; } } mtab->imt4_max--; ASSERT(mtab->imt4_max >= 0); } #ifdef USE_INET6 /* ------------------------------------------------------------------------ */ /* Function: ipf_inet6_mask_add */ /* Returns: Nil */ /* Parameters: bits(I) - number of bits set in mask */ /* mask(I) - pointer to mask to add */ /* mtab(I) - pointer to mask hash table structure */ /* */ /* When called, "bits" represents the mask of an IPv6 NAT map rule that */ /* has just been added. This function inserts a bitmask into the array of */ /* masks to search when searching for a matching NAT rule for a packet. */ /* Prevention of duplicate masks is achieved by checking the use count for */ /* a given netmask. */ /* ------------------------------------------------------------------------ */ void ipf_inet6_mask_add(bits, mask, mtab) int bits; i6addr_t *mask; ipf_v6_masktab_t *mtab; { i6addr_t zero; int i, j; mtab->imt6_masks[bits]++; if (mtab->imt6_masks[bits] > 1) return; if (bits == 0) { mask = &zero; zero.i6[0] = 0; zero.i6[1] = 0; zero.i6[2] = 0; zero.i6[3] = 0; } for (i = 0; i < 129; i++) { if (IP6_LT(&mtab->imt6_active[i], mask)) { for (j = 128; j > i; j--) mtab->imt6_active[j] = mtab->imt6_active[j - 1]; mtab->imt6_active[i] = *mask; break; } } mtab->imt6_max++; } /* ------------------------------------------------------------------------ */ /* Function: ipf_inet6_mask_del */ /* Returns: Nil */ /* Parameters: bits(I) - number of bits set in mask */ /* mask(I) - pointer to mask to remove */ /* mtab(I) - pointer to mask hash table structure */ /* */ /* Remove the 128bit bitmask represented by "bits" from the collection of */ /* netmasks stored inside of mtab. */ /* ------------------------------------------------------------------------ */ void ipf_inet6_mask_del(bits, mask, mtab) int bits; i6addr_t *mask; ipf_v6_masktab_t *mtab; { i6addr_t zero; int i, j; mtab->imt6_masks[bits]--; if (mtab->imt6_masks[bits] > 0) return; if (bits == 0) mask = &zero; zero.i6[0] = 0; zero.i6[1] = 0; zero.i6[2] = 0; zero.i6[3] = 0; for (i = 0; i < 129; i++) { if (IP6_EQ(&mtab->imt6_active[i], mask)) { for (j = i + 1; j < 129; j++) { mtab->imt6_active[j - 1] = mtab->imt6_active[j]; if (IP6_EQ(&mtab->imt6_active[j - 1], &zero)) break; } break; } } mtab->imt6_max--; ASSERT(mtab->imt6_max >= 0); } #endif Index: head/sys/contrib/ipfilter/netinet/ip_compat.h =================================================================== --- head/sys/contrib/ipfilter/netinet/ip_compat.h (revision 305823) +++ head/sys/contrib/ipfilter/netinet/ip_compat.h (revision 305824) @@ -1,1497 +1,1497 @@ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing.
* * @(#)ip_compat.h 1.8 1/14/96 * $FreeBSD$ * Id: ip_compat.h,v 2.142.2.57 2007/10/10 09:51:42 darrenr Exp $ */ #ifndef __IP_COMPAT_H__ #define __IP_COMPAT_H__ #ifndef __P # ifdef __STDC__ # define __P(x) x # else # define __P(x) () # endif #endif #ifndef __STDC__ # undef const # define const #endif #if defined(_KERNEL) || defined(KERNEL) || defined(__KERNEL__) # undef KERNEL # undef _KERNEL # undef __KERNEL__ # define KERNEL # define _KERNEL # define __KERNEL__ #endif #ifndef SOLARIS # if defined(sun) && (defined(__svr4__) || defined(__SVR4)) # define SOLARIS 1 # else # define SOLARIS 0 # endif #endif #if defined(__SVR4) || defined(__svr4__) || defined(__sgi) # define index strchr # if !defined(_KERNEL) # define bzero(a,b) memset(a,0,b) # define bcmp memcmp # define bcopy(a,b,c) memmove(b,a,c) # endif #endif #ifndef LIFNAMSIZ # ifdef IF_NAMESIZE # define LIFNAMSIZ IF_NAMESIZE # else # ifdef IFNAMSIZ # define LIFNAMSIZ IFNAMSIZ # else # define LIFNAMSIZ 16 # endif # endif #endif #if defined(__sgi) || defined(bsdi) || defined(__hpux) || defined(hpux) struct ether_addr { u_char ether_addr_octet[6]; }; #endif # ifdef __STDC__ # define IPL_EXTERN(ep) ipl##ep # else # define IPL_EXTERN(ep) ipl/**/ep # endif /* * This is a workaround for troubles on FreeBSD and OpenBSD. */ # ifndef _KERNEL # define ADD_KERNEL # define _KERNEL # define KERNEL # endif # include # ifdef ADD_KERNEL # undef _KERNEL # undef KERNEL # endif #define NETBSD_GE_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ >= (x))) #define NETBSD_GT_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ > (x))) #define NETBSD_LT_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ < (x))) #define FREEBSD_GE_REV(x) (defined(__FreeBSD_version) && \ (__FreeBSD_version >= (x))) #define FREEBSD_GT_REV(x) (defined(__FreeBSD_version) && \ (__FreeBSD_version > (x))) #define FREEBSD_LT_REV(x) (defined(__FreeBSD_version) && \ (__FreeBSD_version < (x))) #define BSDOS_GE_REV(x) (defined(_BSDI_VERSION) && \ (_BSDI_VERSION >= (x))) #define BSDOS_GT_REV(x) (defined(_BSDI_VERSION) && \ (_BSDI_VERSION > (x))) #define BSDOS_LT_REV(x) (defined(_BSDI_VERSION) && \ (_BSDI_VERSION < (x))) #define OPENBSD_GE_REV(x) (defined(OpenBSD) && (OpenBSD >= (x))) #define OPENBSD_GT_REV(x) (defined(OpenBSD) && (OpenBSD > (x))) #define OPENBSD_LT_REV(x) (defined(OpenBSD) && (OpenBSD < (x))) #define BSD_GE_YEAR(x) (defined(BSD) && (BSD >= (x))) #define BSD_GT_YEAR(x) (defined(BSD) && (BSD > (x))) #define BSD_LT_YEAR(x) (defined(BSD) && (BSD < (x))) /* ----------------------------------------------------------------------- */ /* F R E E B S D */ /* ----------------------------------------------------------------------- */ # define HAS_SYS_MD5_H 1 # if defined(_KERNEL) # include "opt_bpf.h" # include "opt_inet6.h" # if defined(INET6) && !defined(USE_INET6) # define USE_INET6 # endif # else # if !defined(USE_INET6) && !defined(NOINET6) # define USE_INET6 # endif # endif # if defined(_KERNEL) # include # define p_cred td_ucred # define p_uid td_ucred->cr_ruid /* * When #define'd, the 5.2.1 kernel panics when used with the ftp proxy. * There may be other, safe, kernels but this is not extensively tested yet. 
*/ # define HAVE_M_PULLDOWN # if !defined(IPFILTER_LKM) && (__FreeBSD_version >= 300000) # include "opt_ipfilter.h" # endif # define COPYIN(a,b,c) copyin((caddr_t)(a), (caddr_t)(b), (c)) # define COPYOUT(a,b,c) copyout((caddr_t)(a), (caddr_t)(b), (c)) # define NETBSD_PF # else # include # endif /* _KERNEL */ # include # include # include # include # define KRWLOCK_FILL_SZ 56 # define KMUTEX_FILL_SZ 56 # include # define KMUTEX_T struct mtx # define KRWLOCK_T struct rwlock #ifdef _KERNEL # define READ_ENTER(x) rw_rlock(&(x)->ipf_lk) # define WRITE_ENTER(x) rw_wlock(&(x)->ipf_lk) # define MUTEX_DOWNGRADE(x) rw_downgrade(&(x)->ipf_lk) # define RWLOCK_INIT(x,y) rw_init(&(x)->ipf_lk, (y)) # define RW_DESTROY(x) rw_destroy(&(x)->ipf_lk) # define RWLOCK_EXIT(x) do { \ if (rw_wowned(&(x)->ipf_lk)) \ rw_wunlock(&(x)->ipf_lk); \ else \ rw_runlock(&(x)->ipf_lk); \ } while (0) # include # define GETKTIME(x) microtime((struct timeval *)x) # include # include # include # define USE_MUTEXES # define MUTEX_ENTER(x) mtx_lock(&(x)->ipf_lk) # define MUTEX_EXIT(x) mtx_unlock(&(x)->ipf_lk) # define MUTEX_INIT(x,y) mtx_init(&(x)->ipf_lk, (y), NULL,\ MTX_DEF) # define MUTEX_DESTROY(x) mtx_destroy(&(x)->ipf_lk) # define MUTEX_NUKE(x) bzero((x), sizeof(*(x))) /* * Whilst the sx(9) locks on FreeBSD have the right semantics and interface * for what we want to use them for, and testing shows that they work, * with a WITNESS kernel they generate LOR messages. */ # include # define ATOMIC_INC(x) { mtx_lock(&softc->ipf_rw.ipf_lk); (x)++; \ mtx_unlock(&softc->ipf_rw.ipf_lk); } # define ATOMIC_DEC(x) { mtx_lock(&softc->ipf_rw.ipf_lk); (x)--; \ mtx_unlock(&softc->ipf_rw.ipf_lk); } # define ATOMIC_INCL(x) atomic_add_long(&(x), 1) # define ATOMIC_INC64(x) ATOMIC_INC(x) # define ATOMIC_INC32(x) atomic_add_32((u_int *)&(x), 1) # define ATOMIC_DECL(x) atomic_add_long(&(x), -1) # define ATOMIC_DEC64(x) ATOMIC_DEC(x) # define ATOMIC_DEC32(x) atomic_add_32((u_int *)&(x), -1) # define SPL_X(x) ; # define SPL_NET(x) ; # define SPL_IMP(x) ; # define SPL_SCHED(x) ; # define GET_MINOR dev2unit # define MSGDSIZE(m) mbufchainlen(m) # define M_LEN(m) (m)->m_len # define M_ADJ(m,x) m_adj(m, x) -# define M_COPY(x) m_copy((x), 0, M_COPYALL) +# define M_COPYM(x) m_copym((x), 0, M_COPYALL, M_NOWAIT) # define M_DUP(m) m_dup(m, M_NOWAIT) # define IPF_PANIC(x,y) if (x) { printf y; panic("ipf_panic"); } typedef struct mbuf mb_t; #else /* !_KERNEL */ #ifndef _NET_IF_VAR_H_ /* * Userland emulation of struct ifnet. */ struct route; struct mbuf; struct ifnet { char if_xname[IFNAMSIZ]; TAILQ_HEAD(, ifaddr) if_addrlist; int (*if_output)(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); }; #endif /* _NET_IF_VAR_H_ */ #endif /* _KERNEL */ # define IFNAME(x) ((struct ifnet *)x)->if_xname # define COPYIFNAME(v, x, b) \ (void) strncpy(b, \ ((struct ifnet *)x)->if_xname, \ LIFNAMSIZ) typedef u_long ioctlcmd_t; typedef struct uio uio_t; typedef int minor_t; typedef u_int32_t u_32_t; # define U_32_T 1 /* ----------------------------------------------------------------------- */ /* G E N E R I C */ /* ----------------------------------------------------------------------- */ /* * For BSD kernels, if bpf is in the kernel, enable ipfilter to use bpf in * filter rules.
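 * (Editorial note: the three counts tested below are just different spellings, across the BSDs, of "bpf is configured into this kernel" - NBPF, DEV_BPF and NBPFILTER.)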
*/ #if !defined(IPFILTER_BPF) # if (defined(NBPF) && (NBPF > 0)) || (defined(DEV_BPF) && (DEV_BPF > 0)) || \ (defined(NBPFILTER) && (NBPFILTER > 0)) # define IPFILTER_BPF # endif #endif /* * Userland locking primitives */ #ifndef _KERNEL #if !defined(KMUTEX_FILL_SZ) # define KMUTEX_FILL_SZ 1 #endif #if !defined(KRWLOCK_FILL_SZ) # define KRWLOCK_FILL_SZ 1 #endif #endif typedef struct { char *eMm_owner; char *eMm_heldin; u_int eMm_magic; int eMm_held; int eMm_heldat; } eMmutex_t; typedef struct { char *eMrw_owner; char *eMrw_heldin; u_int eMrw_magic; short eMrw_read; short eMrw_write; int eMrw_heldat; } eMrwlock_t; typedef union { char _fill[KMUTEX_FILL_SZ]; #ifdef KMUTEX_T struct { KMUTEX_T ipf_slk; const char *ipf_lname; } ipf_lkun_s; #endif eMmutex_t ipf_emu; } ipfmutex_t; typedef union { char _fill[KRWLOCK_FILL_SZ]; #ifdef KRWLOCK_T struct { KRWLOCK_T ipf_slk; const char *ipf_lname; int ipf_sr; int ipf_sw; u_int ipf_magic; } ipf_lkun_s; #endif eMrwlock_t ipf_emu; } ipfrwlock_t; #define ipf_lk ipf_lkun_s.ipf_slk #define ipf_lname ipf_lkun_s.ipf_lname #define ipf_isr ipf_lkun_s.ipf_sr #define ipf_isw ipf_lkun_s.ipf_sw #define ipf_magic ipf_lkun_s.ipf_magic #if !defined(__GNUC__) || \ (defined(__FreeBSD_version) && (__FreeBSD_version >= 503000)) # ifndef INLINE # define INLINE # endif #else # define INLINE __inline__ #endif #if defined(__FreeBSD_version) && defined(_KERNEL) CTASSERT(sizeof(ipfrwlock_t) == KRWLOCK_FILL_SZ); CTASSERT(sizeof(ipfmutex_t) == KMUTEX_FILL_SZ); #endif /* * In a non-kernel environment, there are a lot of macros that need to be * filled in to be null-ops or to point to some compatibility function, * somewhere in userland. */ #ifndef _KERNEL typedef struct mb_s { struct mb_s *mb_next; char *mb_data; void *mb_ifp; int mb_len; int mb_flags; u_long mb_buf[2048]; } mb_t; # undef m_next # define m_next mb_next # undef m_len # define m_len mb_len # undef m_flags # define m_flags mb_flags # undef m_data # define m_data mb_data # undef M_MCAST # define M_MCAST 0x01 # undef M_BCAST # define M_BCAST 0x02 # undef M_MBCAST # define M_MBCAST 0x04 # define MSGDSIZE(m) msgdsize(m) # define M_LEN(m) (m)->mb_len # define M_ADJ(m,x) (m)->mb_len += x -# define M_COPY(m) dupmbt(m) +# define M_COPYM(m) dupmbt(m) # define M_DUP(m) dupmbt(m) # define GETKTIME(x) gettimeofday((struct timeval *)(x), NULL) # define MTOD(m, t) ((t)(m)->mb_data) # define FREE_MB_T(m) freembt(m) # define ALLOC_MB_T(m,l) (m) = allocmbt(l) # define PREP_MB_T(f, m) do { \ (m)->mb_next = *(f)->fin_mp; \ *(fin)->fin_mp = (m); \ (f)->fin_m = (m); \ } while (0) # define SLEEP(x,y) 1; # define WAKEUP(x,y) ; # define POLLWAKEUP(y) ; # define IPF_PANIC(x,y) ; # define PANIC(x,y) ; # define SPL_SCHED(x) ; # define SPL_NET(x) ; # define SPL_IMP(x) ; # define SPL_X(x) ; # define KMALLOC(a,b) (a) = (b)malloc(sizeof(*a)) # define KMALLOCS(a,b,c) (a) = (b)malloc(c) # define KFREE(x) free(x) # define KFREES(x,s) free(x) # define GETIFP(x, v) get_unit(x,v) # define GETIFMTU_4(x) 2048 # define GETIFMTU_6(x) 2048 # define COPYIN(a,b,c) bcopywrap((a), (b), (c)) # define COPYOUT(a,b,c) bcopywrap((a), (b), (c)) # define COPYDATA(m, o, l, b) bcopy(MTOD((mb_t *)m, char *) + (o), \ (b), (l)) # define COPYBACK(m, o, l, b) bcopy((b), \ MTOD((mb_t *)m, char *) + (o), \ (l)) # define UIOMOVE(a,b,c,d) ipfuiomove((caddr_t)a,b,c,d) extern void m_copydata __P((mb_t *, int, int, caddr_t)); extern int ipfuiomove __P((caddr_t, int, int, struct uio *)); extern int bcopywrap __P((void *, void *, size_t)); extern mb_t *allocmbt __P((size_t)); extern 
mb_t *dupmbt __P((mb_t *)); extern void freembt __P((mb_t *)); # define MUTEX_DESTROY(x) eMmutex_destroy(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_ENTER(x) eMmutex_enter(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_EXIT(x) eMmutex_exit(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_INIT(x,y) eMmutex_init(&(x)->ipf_emu, y, \ __FILE__, __LINE__) # define MUTEX_NUKE(x) bzero((x), sizeof(*(x))) # define MUTEX_DOWNGRADE(x) eMrwlock_downgrade(&(x)->ipf_emu, \ __FILE__, __LINE__) # define READ_ENTER(x) eMrwlock_read_enter(&(x)->ipf_emu, \ __FILE__, __LINE__) # define RWLOCK_INIT(x, y) eMrwlock_init(&(x)->ipf_emu, y) # define RWLOCK_EXIT(x) eMrwlock_exit(&(x)->ipf_emu) # define RW_DESTROY(x) eMrwlock_destroy(&(x)->ipf_emu) # define WRITE_ENTER(x) eMrwlock_write_enter(&(x)->ipf_emu, \ __FILE__, \ __LINE__) # define USE_MUTEXES 1 extern void eMmutex_destroy __P((eMmutex_t *, char *, int)); extern void eMmutex_enter __P((eMmutex_t *, char *, int)); extern void eMmutex_exit __P((eMmutex_t *, char *, int)); extern void eMmutex_init __P((eMmutex_t *, char *, char *, int)); extern void eMrwlock_destroy __P((eMrwlock_t *)); extern void eMrwlock_exit __P((eMrwlock_t *)); extern void eMrwlock_init __P((eMrwlock_t *, char *)); extern void eMrwlock_read_enter __P((eMrwlock_t *, char *, int)); extern void eMrwlock_write_enter __P((eMrwlock_t *, char *, int)); extern void eMrwlock_downgrade __P((eMrwlock_t *, char *, int)); #endif extern mb_t *allocmbt(size_t); #define MAX_IPV4HDR ((0xf << 2) + sizeof(struct icmp) + sizeof(ip_t) + 8) #ifndef IP_OFFMASK # define IP_OFFMASK 0x1fff #endif /* * On BSD's use quad_t as a guarantee for getting at least a 64bit sized * object. */ #if !defined(__amd64__) && BSD_GT_YEAR(199306) # define USE_QUAD_T # define U_QUAD_T unsigned long long # define QUAD_T long long #else /* BSD > 199306 */ # if !defined(U_QUAD_T) # define U_QUAD_T u_long # define QUAD_T long # endif #endif /* BSD > 199306 */ #ifdef USE_INET6 # if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD__) || \ defined(__osf__) || defined(linux) # include # include # if defined(_KERNEL) && !defined(__osf__) # include # endif typedef struct ip6_hdr ip6_t; # endif #endif #ifndef MAX # define MAX(a,b) (((a) > (b)) ? 
(a) : (b)) #endif #if defined(_KERNEL) # if defined(MENTAT) && !defined(INSTANCES) # define COPYDATA mb_copydata # define COPYBACK mb_copyback # else # define COPYDATA m_copydata # define COPYBACK m_copyback # endif # if (defined(__NetBSD_Version__) && (__NetBSD_Version__ < 105180000)) || \ defined(__FreeBSD__) || (defined(OpenBSD) && (OpenBSD < 200206)) || \ defined(_BSDI_VERSION) # include # endif # if !defined(__FreeBSD__) || FREEBSD_GE_REV(300000) # if NETBSD_GE_REV(105180000) || OPENBSD_GE_REV(200111) # include # else # include extern vm_map_t kmem_map; # endif # include # else /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD_version >= 300000) */ # include # endif /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD_version >= 300000) */ # ifdef IPFILTER_M_IPFILTER # include MALLOC_DECLARE(M_IPFILTER); # define _M_IPF M_IPFILTER # else /* IPFILTER_M_IPFILTER */ # ifdef M_PFIL # define _M_IPF M_PFIL # else # ifdef M_IPFILTER # define _M_IPF M_IPFILTER # else # define _M_IPF M_TEMP # endif /* M_IPFILTER */ # endif /* M_PFIL */ # endif /* IPFILTER_M_IPFILTER */ # if !defined(KMALLOC) # define KMALLOC(a, b) MALLOC((a), b, sizeof(*(a)), _M_IPF, M_NOWAIT) # endif # if !defined(KMALLOCS) # define KMALLOCS(a, b, c) MALLOC((a), b, (c), _M_IPF, M_NOWAIT) # endif # if !defined(KFREE) # define KFREE(x) FREE((x), _M_IPF) # endif # if !defined(KFREES) # define KFREES(x,s) FREE((x), _M_IPF) # endif # define UIOMOVE(a,b,c,d) uiomove((caddr_t)a,b,d) # define SLEEP(id, n) tsleep((id), PPAUSE|PCATCH, n, 0) # define WAKEUP(id,x) wakeup(id+x) # if !defined(POLLWAKEUP) # define POLLWAKEUP(x) selwakeup(softc->ipf_selwait+x) # endif # define GETIFP(n, v) ifunit(n) # define GETIFMTU_4(x) ((struct ifnet *)x)->if_mtu # define GETIFMTU_6(x) ((struct ifnet *)x)->if_mtu # if !defined(USE_MUTEXES) && !defined(SPL_NET) # define SPL_IMP(x) x = splimp() # define SPL_NET(x) x = splnet() # if !defined(SPL_SCHED) # define SPL_SCHED(x) x = splsched() # endif # define SPL_X(x) (void) splx(x) # endif /* !USE_MUTEXES */ # ifndef FREE_MB_T # define FREE_MB_T(m) m_freem(m) # endif # ifndef ALLOC_MB_T # ifdef MGETHDR # define ALLOC_MB_T(m,l) do { \ MGETHDR((m), M_NOWAIT, MT_HEADER); \ if ((m) != NULL) { \ (m)->m_len = (l); \ (m)->m_pkthdr.len = (l); \ } \ } while (0) # else # define ALLOC_MB_T(m,l) do { \ MGET((m), M_NOWAIT, MT_HEADER); \ if ((m) != NULL) { \ (m)->m_len = (l); \ (m)->m_pkthdr.len = (l); \ } \ } while (0) # endif # endif # ifndef PREP_MB_T # define PREP_MB_T(f, m) do { \ mb_t *_o = *(f)->fin_mp; \ (m)->m_next = _o; \ *(fin)->fin_mp = (m); \ if (_o->m_flags & M_PKTHDR) { \ (m)->m_pkthdr.len += \ _o->m_pkthdr.len; \ (m)->m_pkthdr.rcvif = \ _o->m_pkthdr.rcvif; \ } \ } while (0) # endif # ifndef M_DUP # ifdef M_COPYALL # define M_DUP(m) m_dup(m, 0, M_COPYALL, 0) # else # define M_DUP(m) m_dup(m) # endif # endif # ifndef MTOD # define MTOD(m,t) mtod(m,t) # endif # ifndef COPYIN # define COPYIN(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # define COPYOUT(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # endif # ifndef KMALLOC # define KMALLOC(a,b) (a) = (b)new_kmem_alloc(sizeof(*(a)), \ KMEM_NOSLEEP) # define KMALLOCS(a,b,c) (a) = (b)new_kmem_alloc((c), KMEM_NOSLEEP) # endif # ifndef GET_MINOR # define GET_MINOR(x) dev2unit(x) # endif # define PANIC(x,y) if (x) panic y #endif /* _KERNEL */ #if !defined(IFNAME) && !defined(_KERNEL) # define IFNAME(x) get_ifname((struct ifnet *)x) #endif #ifndef COPYIFNAME # define NEED_FRGETIFNAME extern char *ipf_getifname __P((struct ifnet *, char *)); # define COPYIFNAME(v, x, b) \ 
ipf_getifname((struct ifnet *)x, b) #endif #ifndef ASSERT # ifdef _KERNEL # define ASSERT(x) # else # define ASSERT(x) do { if (!(x)) abort(); } while (0) # endif #endif #ifndef BCOPYIN # define BCOPYIN(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # define BCOPYOUT(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) #endif /* * The ctype(3) posix definition, if used "safely" in code everywhere, * would mean all normal code that walks through strings needed casts. Yuck. */ #define ISALNUM(x) isalnum((u_char)(x)) #define ISALPHA(x) isalpha((u_char)(x)) #define ISDIGIT(x) isdigit((u_char)(x)) #define ISSPACE(x) isspace((u_char)(x)) #define ISUPPER(x) isupper((u_char)(x)) #define ISXDIGIT(x) isxdigit((u_char)(x)) #define ISLOWER(x) islower((u_char)(x)) #define TOUPPER(x) toupper((u_char)(x)) #define TOLOWER(x) tolower((u_char)(x)) /* * If mutexes aren't being used, turn all the mutex functions into null-ops. */ #if !defined(USE_MUTEXES) # define USE_SPL 1 # undef RW_DESTROY # undef MUTEX_INIT # undef MUTEX_NUKE # undef MUTEX_DESTROY # define MUTEX_ENTER(x) ; # define READ_ENTER(x) ; # define WRITE_ENTER(x) ; # define MUTEX_DOWNGRADE(x) ; # define RWLOCK_INIT(x, y) ; # define RWLOCK_EXIT(x) ; # define RW_DESTROY(x) ; # define MUTEX_EXIT(x) ; # define MUTEX_INIT(x,y) ; # define MUTEX_DESTROY(x) ; # define MUTEX_NUKE(x) ; #endif /* !USE_MUTEXES */ #ifndef ATOMIC_INC # define ATOMIC_INC(x) (x)++ # define ATOMIC_DEC(x) (x)-- #endif #if defined(USE_SPL) && defined(_KERNEL) # define SPL_INT(x) int x #else # define SPL_INT(x) #endif /* * If there are no atomic operations for bit sizes defined, define them to all * use a generic one that works for all sizes. */ #ifndef ATOMIC_INCL # define ATOMIC_INCL ATOMIC_INC # define ATOMIC_INC64 ATOMIC_INC # define ATOMIC_INC32 ATOMIC_INC # define ATOMIC_DECL ATOMIC_DEC # define ATOMIC_DEC64 ATOMIC_DEC # define ATOMIC_DEC32 ATOMIC_DEC #endif #ifndef HDR_T_PRIVATE typedef struct tcphdr tcphdr_t; typedef struct udphdr udphdr_t; #endif typedef struct icmp icmphdr_t; typedef struct ip ip_t; typedef struct ether_header ether_header_t; typedef struct tcpiphdr tcpiphdr_t; #ifndef FR_GROUPLEN # define FR_GROUPLEN 16 #endif #ifndef offsetof # define offsetof(t,m) (size_t)((&((t *)0L)->m)) #endif #ifndef stsizeof # define stsizeof(t,m) sizeof(((t *)0L)->m) #endif /* * This set of macros has been brought about because on Tru64 it is not * possible to easily assign or examine values in a structure that are * bit fields. */ #ifndef IP_V # define IP_V(x) (x)->ip_v #endif #ifndef IP_V_A # define IP_V_A(x,y) (x)->ip_v = (y) #endif #ifndef IP_HL # define IP_HL(x) (x)->ip_hl #endif #ifndef IP_HL_A # define IP_HL_A(x,y) (x)->ip_hl = ((y) & 0xf) #endif #ifndef TCP_X2 # define TCP_X2(x) (x)->th_x2 #endif #ifndef TCP_X2_A # define TCP_X2_A(x,y) (x)->th_x2 = (y) #endif #ifndef TCP_OFF # define TCP_OFF(x) (x)->th_off #endif #ifndef TCP_OFF_A # define TCP_OFF_A(x,y) (x)->th_off = (y) #endif #define IPMINLEN(i, h) ((i)->ip_len >= (IP_HL(i) * 4 + sizeof(struct h))) /* * XXX - This is one of those *awful* hacks which nobody likes */ #ifdef ultrix #define A_A #else #define A_A & #endif #define TCPF_ALL (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|\ TH_ECN|TH_CWR) #if BSD_GE_YEAR(199306) && !defined(m_act) # define m_act m_nextpkt #endif /* * Security Options for Internet Protocol (IPSO) as defined in RFC 1108.
* * Basic Option * * 00000001 - (Reserved 4) * 00111101 - Top Secret * 01011010 - Secret * 10010110 - Confidential * 01100110 - (Reserved 3) * 11001100 - (Reserved 2) * 10101011 - Unclassified * 11110001 - (Reserved 1) */ #define IPSO_CLASS_RES4 0x01 #define IPSO_CLASS_TOPS 0x3d #define IPSO_CLASS_SECR 0x5a #define IPSO_CLASS_CONF 0x96 #define IPSO_CLASS_RES3 0x66 #define IPSO_CLASS_RES2 0xcc #define IPSO_CLASS_UNCL 0xab #define IPSO_CLASS_RES1 0xf1 #define IPSO_AUTH_GENSER 0x80 #define IPSO_AUTH_ESI 0x40 #define IPSO_AUTH_SCI 0x20 #define IPSO_AUTH_NSA 0x10 #define IPSO_AUTH_DOE 0x08 #define IPSO_AUTH_UN 0x06 #define IPSO_AUTH_FTE 0x01 /* * IP option #defines */ #undef IPOPT_RR #define IPOPT_RR 7 #undef IPOPT_ZSU #define IPOPT_ZSU 10 /* ZSU */ #undef IPOPT_MTUP #define IPOPT_MTUP 11 /* MTUP */ #undef IPOPT_MTUR #define IPOPT_MTUR 12 /* MTUR */ #undef IPOPT_ENCODE #define IPOPT_ENCODE 15 /* ENCODE */ #undef IPOPT_TS #define IPOPT_TS 68 #undef IPOPT_TR #define IPOPT_TR 82 /* TR */ #undef IPOPT_SECURITY #define IPOPT_SECURITY 130 #undef IPOPT_LSRR #define IPOPT_LSRR 131 #undef IPOPT_E_SEC #define IPOPT_E_SEC 133 /* E-SEC */ #undef IPOPT_CIPSO #define IPOPT_CIPSO 134 /* CIPSO */ #undef IPOPT_SATID #define IPOPT_SATID 136 #ifndef IPOPT_SID # define IPOPT_SID IPOPT_SATID #endif #undef IPOPT_SSRR #define IPOPT_SSRR 137 #undef IPOPT_ADDEXT #define IPOPT_ADDEXT 147 /* ADDEXT */ #undef IPOPT_VISA #define IPOPT_VISA 142 /* VISA */ #undef IPOPT_IMITD #define IPOPT_IMITD 144 /* IMITD */ #undef IPOPT_EIP #define IPOPT_EIP 145 /* EIP */ #undef IPOPT_RTRALRT #define IPOPT_RTRALRT 148 /* RTRALRT */ #undef IPOPT_SDB #define IPOPT_SDB 149 #undef IPOPT_NSAPA #define IPOPT_NSAPA 150 #undef IPOPT_DPS #define IPOPT_DPS 151 #undef IPOPT_UMP #define IPOPT_UMP 152 #undef IPOPT_FINN #define IPOPT_FINN 205 /* FINN */ #undef IPOPT_AH #define IPOPT_AH 256+IPPROTO_AH #ifndef TCPOPT_EOL # define TCPOPT_EOL 0 #endif #ifndef TCPOPT_NOP # define TCPOPT_NOP 1 #endif #ifndef TCPOPT_MAXSEG # define TCPOPT_MAXSEG 2 #endif #ifndef TCPOLEN_MAXSEG # define TCPOLEN_MAXSEG 4 #endif #ifndef TCPOPT_WINDOW # define TCPOPT_WINDOW 3 #endif #ifndef TCPOLEN_WINDOW # define TCPOLEN_WINDOW 3 #endif #ifndef TCPOPT_SACK_PERMITTED # define TCPOPT_SACK_PERMITTED 4 #endif #ifndef TCPOLEN_SACK_PERMITTED # define TCPOLEN_SACK_PERMITTED 2 #endif #ifndef TCPOPT_SACK # define TCPOPT_SACK 5 #endif #ifndef TCPOPT_TIMESTAMP # define TCPOPT_TIMESTAMP 8 #endif #ifndef ICMP_MINLEN # define ICMP_MINLEN 8 #endif #ifndef ICMP_ECHOREPLY # define ICMP_ECHOREPLY 0 #endif #ifndef ICMP_UNREACH # define ICMP_UNREACH 3 #endif #ifndef ICMP_UNREACH_NET # define ICMP_UNREACH_NET 0 #endif #ifndef ICMP_UNREACH_HOST # define ICMP_UNREACH_HOST 1 #endif #ifndef ICMP_UNREACH_PROTOCOL # define ICMP_UNREACH_PROTOCOL 2 #endif #ifndef ICMP_UNREACH_PORT # define ICMP_UNREACH_PORT 3 #endif #ifndef ICMP_UNREACH_NEEDFRAG # define ICMP_UNREACH_NEEDFRAG 4 #endif #ifndef ICMP_UNREACH_SRCFAIL # define ICMP_UNREACH_SRCFAIL 5 #endif #ifndef ICMP_UNREACH_NET_UNKNOWN # define ICMP_UNREACH_NET_UNKNOWN 6 #endif #ifndef ICMP_UNREACH_HOST_UNKNOWN # define ICMP_UNREACH_HOST_UNKNOWN 7 #endif #ifndef ICMP_UNREACH_ISOLATED # define ICMP_UNREACH_ISOLATED 8 #endif #ifndef ICMP_UNREACH_NET_PROHIB # define ICMP_UNREACH_NET_PROHIB 9 #endif #ifndef ICMP_UNREACH_HOST_PROHIB # define ICMP_UNREACH_HOST_PROHIB 10 #endif #ifndef ICMP_UNREACH_TOSNET # define ICMP_UNREACH_TOSNET 11 #endif #ifndef ICMP_UNREACH_TOSHOST # define ICMP_UNREACH_TOSHOST 12 #endif #ifndef ICMP_UNREACH_ADMIN_PROHIBIT # define 
ICMP_UNREACH_ADMIN_PROHIBIT 13 #endif #ifndef ICMP_UNREACH_FILTER # define ICMP_UNREACH_FILTER 13 #endif #ifndef ICMP_UNREACH_HOST_PRECEDENCE # define ICMP_UNREACH_HOST_PRECEDENCE 14 #endif #ifndef ICMP_UNREACH_PRECEDENCE_CUTOFF # define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 #endif #ifndef ICMP_SOURCEQUENCH # define ICMP_SOURCEQUENCH 4 #endif #ifndef ICMP_REDIRECT_NET # define ICMP_REDIRECT_NET 0 #endif #ifndef ICMP_REDIRECT_HOST # define ICMP_REDIRECT_HOST 1 #endif #ifndef ICMP_REDIRECT_TOSNET # define ICMP_REDIRECT_TOSNET 2 #endif #ifndef ICMP_REDIRECT_TOSHOST # define ICMP_REDIRECT_TOSHOST 3 #endif #ifndef ICMP_ALTHOSTADDR # define ICMP_ALTHOSTADDR 6 #endif #ifndef ICMP_TIMXCEED # define ICMP_TIMXCEED 11 #endif #ifndef ICMP_TIMXCEED_INTRANS # define ICMP_TIMXCEED_INTRANS 0 #endif #ifndef ICMP_TIMXCEED_REASS # define ICMP_TIMXCEED_REASS 1 #endif #ifndef ICMP_PARAMPROB # define ICMP_PARAMPROB 12 #endif #ifndef ICMP_PARAMPROB_ERRATPTR # define ICMP_PARAMPROB_ERRATPTR 0 #endif #ifndef ICMP_PARAMPROB_OPTABSENT # define ICMP_PARAMPROB_OPTABSENT 1 #endif #ifndef ICMP_PARAMPROB_LENGTH # define ICMP_PARAMPROB_LENGTH 2 #endif #ifndef ICMP_TSTAMP # define ICMP_TSTAMP 13 #endif #ifndef ICMP_TSTAMPREPLY # define ICMP_TSTAMPREPLY 14 #endif #ifndef ICMP_IREQ # define ICMP_IREQ 15 #endif #ifndef ICMP_IREQREPLY # define ICMP_IREQREPLY 16 #endif #ifndef ICMP_MASKREQ # define ICMP_MASKREQ 17 #endif #ifndef ICMP_MASKREPLY # define ICMP_MASKREPLY 18 #endif #ifndef ICMP_TRACEROUTE # define ICMP_TRACEROUTE 30 #endif #ifndef ICMP_DATACONVERR # define ICMP_DATACONVERR 31 #endif #ifndef ICMP_MOBILE_REDIRECT # define ICMP_MOBILE_REDIRECT 32 #endif #ifndef ICMP_IPV6_WHEREAREYOU # define ICMP_IPV6_WHEREAREYOU 33 #endif #ifndef ICMP_IPV6_IAMHERE # define ICMP_IPV6_IAMHERE 34 #endif #ifndef ICMP_MOBILE_REGREQUEST # define ICMP_MOBILE_REGREQUEST 35 #endif #ifndef ICMP_MOBILE_REGREPLY # define ICMP_MOBILE_REGREPLY 36 #endif #ifndef ICMP_SKIP # define ICMP_SKIP 39 #endif #ifndef ICMP_PHOTURIS # define ICMP_PHOTURIS 40 #endif #ifndef ICMP_PHOTURIS_UNKNOWN_INDEX # define ICMP_PHOTURIS_UNKNOWN_INDEX 1 #endif #ifndef ICMP_PHOTURIS_AUTH_FAILED # define ICMP_PHOTURIS_AUTH_FAILED 2 #endif #ifndef ICMP_PHOTURIS_DECRYPT_FAILED # define ICMP_PHOTURIS_DECRYPT_FAILED 3 #endif #ifndef IPVERSION # define IPVERSION 4 #endif #ifndef IPOPT_MINOFF # define IPOPT_MINOFF 4 #endif #ifndef IPOPT_COPIED # define IPOPT_COPIED(x) ((x)&0x80) #endif #ifndef IPOPT_EOL # define IPOPT_EOL 0 #endif #ifndef IPOPT_NOP # define IPOPT_NOP 1 #endif #ifndef IP_MF # define IP_MF ((u_short)0x2000) #endif #ifndef ETHERTYPE_IP # define ETHERTYPE_IP ((u_short)0x0800) #endif #ifndef TH_FIN # define TH_FIN 0x01 #endif #ifndef TH_SYN # define TH_SYN 0x02 #endif #ifndef TH_RST # define TH_RST 0x04 #endif #ifndef TH_PUSH # define TH_PUSH 0x08 #endif #ifndef TH_ACK # define TH_ACK 0x10 #endif #ifndef TH_URG # define TH_URG 0x20 #endif #undef TH_ACKMASK #define TH_ACKMASK (TH_FIN|TH_SYN|TH_RST|TH_ACK) #ifndef IPOPT_EOL # define IPOPT_EOL 0 #endif #ifndef IPOPT_NOP # define IPOPT_NOP 1 #endif #ifndef IPOPT_RR # define IPOPT_RR 7 #endif #ifndef IPOPT_TS # define IPOPT_TS 68 #endif #ifndef IPOPT_SECURITY # define IPOPT_SECURITY 130 #endif #ifndef IPOPT_LSRR # define IPOPT_LSRR 131 #endif #ifndef IPOPT_SATID # define IPOPT_SATID 136 #endif #ifndef IPOPT_SSRR # define IPOPT_SSRR 137 #endif #ifndef IPOPT_SECUR_UNCLASS # define IPOPT_SECUR_UNCLASS ((u_short)0x0000) #endif #ifndef IPOPT_SECUR_CONFID # define IPOPT_SECUR_CONFID ((u_short)0xf135) #endif #ifndef IPOPT_SECUR_EFTO # 
define IPOPT_SECUR_EFTO ((u_short)0x789a) #endif #ifndef IPOPT_SECUR_MMMM # define IPOPT_SECUR_MMMM ((u_short)0xbc4d) #endif #ifndef IPOPT_SECUR_RESTR # define IPOPT_SECUR_RESTR ((u_short)0xaf13) #endif #ifndef IPOPT_SECUR_SECRET # define IPOPT_SECUR_SECRET ((u_short)0xd788) #endif #ifndef IPOPT_SECUR_TOPSECRET # define IPOPT_SECUR_TOPSECRET ((u_short)0x6bc5) #endif #ifndef IPOPT_OLEN # define IPOPT_OLEN 1 #endif #ifndef IPPROTO_HOPOPTS # define IPPROTO_HOPOPTS 0 #endif #ifndef IPPROTO_IPIP # define IPPROTO_IPIP 4 #endif #ifndef IPPROTO_ENCAP # define IPPROTO_ENCAP 98 #endif #ifndef IPPROTO_IPV6 # define IPPROTO_IPV6 41 #endif #ifndef IPPROTO_ROUTING # define IPPROTO_ROUTING 43 #endif #ifndef IPPROTO_FRAGMENT # define IPPROTO_FRAGMENT 44 #endif #ifndef IPPROTO_GRE # define IPPROTO_GRE 47 /* GRE encaps RFC 1701 */ #endif #ifndef IPPROTO_ESP # define IPPROTO_ESP 50 #endif #ifndef IPPROTO_AH # define IPPROTO_AH 51 #endif #ifndef IPPROTO_ICMPV6 # define IPPROTO_ICMPV6 58 #endif #ifndef IPPROTO_NONE # define IPPROTO_NONE 59 #endif #ifndef IPPROTO_DSTOPTS # define IPPROTO_DSTOPTS 60 #endif #ifndef IPPROTO_MOBILITY # define IPPROTO_MOBILITY 135 #endif #ifndef ICMP_ROUTERADVERT # define ICMP_ROUTERADVERT 9 #endif #ifndef ICMP_ROUTERSOLICIT # define ICMP_ROUTERSOLICIT 10 #endif #ifndef ICMP6_DST_UNREACH # define ICMP6_DST_UNREACH 1 #endif #ifndef ICMP6_PACKET_TOO_BIG # define ICMP6_PACKET_TOO_BIG 2 #endif #ifndef ICMP6_TIME_EXCEEDED # define ICMP6_TIME_EXCEEDED 3 #endif #ifndef ICMP6_PARAM_PROB # define ICMP6_PARAM_PROB 4 #endif #ifndef ICMP6_ECHO_REQUEST # define ICMP6_ECHO_REQUEST 128 #endif #ifndef ICMP6_ECHO_REPLY # define ICMP6_ECHO_REPLY 129 #endif #ifndef ICMP6_MEMBERSHIP_QUERY # define ICMP6_MEMBERSHIP_QUERY 130 #endif #ifndef MLD6_LISTENER_QUERY # define MLD6_LISTENER_QUERY 130 #endif #ifndef ICMP6_MEMBERSHIP_REPORT # define ICMP6_MEMBERSHIP_REPORT 131 #endif #ifndef MLD6_LISTENER_REPORT # define MLD6_LISTENER_REPORT 131 #endif #ifndef ICMP6_MEMBERSHIP_REDUCTION # define ICMP6_MEMBERSHIP_REDUCTION 132 #endif #ifndef MLD6_LISTENER_DONE # define MLD6_LISTENER_DONE 132 #endif #ifndef ND_ROUTER_SOLICIT # define ND_ROUTER_SOLICIT 133 #endif #ifndef ND_ROUTER_ADVERT # define ND_ROUTER_ADVERT 134 #endif #ifndef ND_NEIGHBOR_SOLICIT # define ND_NEIGHBOR_SOLICIT 135 #endif #ifndef ND_NEIGHBOR_ADVERT # define ND_NEIGHBOR_ADVERT 136 #endif #ifndef ND_REDIRECT # define ND_REDIRECT 137 #endif #ifndef ICMP6_ROUTER_RENUMBERING # define ICMP6_ROUTER_RENUMBERING 138 #endif #ifndef ICMP6_WRUREQUEST # define ICMP6_WRUREQUEST 139 #endif #ifndef ICMP6_WRUREPLY # define ICMP6_WRUREPLY 140 #endif #ifndef ICMP6_FQDN_QUERY # define ICMP6_FQDN_QUERY 139 #endif #ifndef ICMP6_FQDN_REPLY # define ICMP6_FQDN_REPLY 140 #endif #ifndef ICMP6_NI_QUERY # define ICMP6_NI_QUERY 139 #endif #ifndef ICMP6_NI_REPLY # define ICMP6_NI_REPLY 140 #endif #ifndef MLD6_MTRACE_RESP # define MLD6_MTRACE_RESP 200 #endif #ifndef MLD6_MTRACE # define MLD6_MTRACE 201 #endif #ifndef ICMP6_HADISCOV_REQUEST # define ICMP6_HADISCOV_REQUEST 202 #endif #ifndef ICMP6_HADISCOV_REPLY # define ICMP6_HADISCOV_REPLY 203 #endif #ifndef ICMP6_MOBILEPREFIX_SOLICIT # define ICMP6_MOBILEPREFIX_SOLICIT 204 #endif #ifndef ICMP6_MOBILEPREFIX_ADVERT # define ICMP6_MOBILEPREFIX_ADVERT 205 #endif #ifndef ICMP6_MAXTYPE # define ICMP6_MAXTYPE 205 #endif #ifndef ICMP6_DST_UNREACH_NOROUTE # define ICMP6_DST_UNREACH_NOROUTE 0 #endif #ifndef ICMP6_DST_UNREACH_ADMIN # define ICMP6_DST_UNREACH_ADMIN 1 #endif #ifndef ICMP6_DST_UNREACH_NOTNEIGHBOR # define 
ICMP6_DST_UNREACH_NOTNEIGHBOR 2 #endif #ifndef ICMP6_DST_UNREACH_BEYONDSCOPE # define ICMP6_DST_UNREACH_BEYONDSCOPE 2 #endif #ifndef ICMP6_DST_UNREACH_ADDR # define ICMP6_DST_UNREACH_ADDR 3 #endif #ifndef ICMP6_DST_UNREACH_NOPORT # define ICMP6_DST_UNREACH_NOPORT 4 #endif #ifndef ICMP6_TIME_EXCEED_TRANSIT # define ICMP6_TIME_EXCEED_TRANSIT 0 #endif #ifndef ICMP6_TIME_EXCEED_REASSEMBLY # define ICMP6_TIME_EXCEED_REASSEMBLY 1 #endif #ifndef ICMP6_NI_SUCCESS # define ICMP6_NI_SUCCESS 0 #endif #ifndef ICMP6_NI_REFUSED # define ICMP6_NI_REFUSED 1 #endif #ifndef ICMP6_NI_UNKNOWN # define ICMP6_NI_UNKNOWN 2 #endif #ifndef ICMP6_ROUTER_RENUMBERING_COMMAND # define ICMP6_ROUTER_RENUMBERING_COMMAND 0 #endif #ifndef ICMP6_ROUTER_RENUMBERING_RESULT # define ICMP6_ROUTER_RENUMBERING_RESULT 1 #endif #ifndef ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET # define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 #endif #ifndef ICMP6_PARAMPROB_HEADER # define ICMP6_PARAMPROB_HEADER 0 #endif #ifndef ICMP6_PARAMPROB_NEXTHEADER # define ICMP6_PARAMPROB_NEXTHEADER 1 #endif #ifndef ICMP6_PARAMPROB_OPTION # define ICMP6_PARAMPROB_OPTION 2 #endif #ifndef ICMP6_NI_SUBJ_IPV6 # define ICMP6_NI_SUBJ_IPV6 0 #endif #ifndef ICMP6_NI_SUBJ_FQDN # define ICMP6_NI_SUBJ_FQDN 1 #endif #ifndef ICMP6_NI_SUBJ_IPV4 # define ICMP6_NI_SUBJ_IPV4 2 #endif #ifndef MLD_MTRACE_RESP # define MLD_MTRACE_RESP 200 #endif #ifndef MLD_MTRACE # define MLD_MTRACE 201 #endif #ifndef MLD6_MTRACE_RESP # define MLD6_MTRACE_RESP MLD_MTRACE_RESP #endif #ifndef MLD6_MTRACE # define MLD6_MTRACE MLD_MTRACE #endif #if !defined(IPV6_FLOWINFO_MASK) # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN) # define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */ # else # if (BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN) # define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ # endif /* LITTLE_ENDIAN */ # endif #endif #if !defined(IPV6_FLOWLABEL_MASK) # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN) # define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */ # else # if (BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN) # define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ # endif /* LITTLE_ENDIAN */ # endif #endif /* * ECN is a new addition to TCP - RFC 2481 */ #ifndef TH_ECN # define TH_ECN 0x40 #endif #ifndef TH_CWR # define TH_CWR 0x80 #endif #define TH_ECNALL (TH_ECN|TH_CWR) /* * TCP States */ #define IPF_TCPS_LISTEN 0 /* listening for connection */ #define IPF_TCPS_SYN_SENT 1 /* active, have sent syn */ #define IPF_TCPS_SYN_RECEIVED 2 /* have sent and received syn */ #define IPF_TCPS_HALF_ESTAB 3 /* for connections not fully "up" */ /* states < IPF_TCPS_ESTABLISHED are those where connections not established */ #define IPF_TCPS_ESTABLISHED 4 /* established */ #define IPF_TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ /* states > IPF_TCPS_CLOSE_WAIT are those where user has closed */ #define IPF_TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ #define IPF_TCPS_CLOSING 7 /* closed, xchd FIN; await FIN ACK */ #define IPF_TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ /* states > IPF_TCPS_CLOSE_WAIT && < IPF_TCPS_FIN_WAIT_2 await ACK of FIN */ #define IPF_TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ #define IPF_TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ #define IPF_TCPS_CLOSED 11 /* closed */ #define IPF_TCP_NSTATES 12 #define TCP_MSL 120 #undef ICMP_MAX_UNREACH #define ICMP_MAX_UNREACH 14 #undef ICMP_MAXTYPE #define ICMP_MAXTYPE 18 #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif #ifndef LOG_FTP # 
define LOG_FTP (11<<3) #endif #ifndef LOG_AUTHPRIV # define LOG_AUTHPRIV (10<<3) #endif #ifndef LOG_AUDIT # define LOG_AUDIT (13<<3) #endif #ifndef LOG_NTP # define LOG_NTP (12<<3) #endif #ifndef LOG_SECURITY # define LOG_SECURITY (13<<3) #endif #ifndef LOG_LFMT # define LOG_LFMT (14<<3) #endif #ifndef LOG_CONSOLE # define LOG_CONSOLE (14<<3) #endif /* * ICMP error replies have an IP header (20 bytes), 8 bytes of ICMP data, * another IP header and then 64 bits of data, totalling 56. Of course, * the last 64 bits is dependent on that being available. */ #define ICMPERR_ICMPHLEN 8 #define ICMPERR_IPICMPHLEN (20 + 8) #define ICMPERR_MINPKTLEN (20 + 8 + 20) #define ICMPERR_MAXPKTLEN (20 + 8 + 20 + 8) #define ICMP6ERR_MINPKTLEN (40 + 8) #define ICMP6ERR_IPICMPHLEN (40 + 8 + 40) #ifndef MIN # define MIN(a,b) (((a)<(b))?(a):(b)) #endif #ifdef RESCUE # undef IPFILTER_BPF #endif #ifdef IPF_DEBUG # define DPRINT(x) printf x #else # define DPRINT(x) #endif #ifndef AF_INET6 # define AF_INET6 26 #endif #ifdef DTRACE_PROBE # ifdef _KERNEL # define DT(_n) DTRACE_PROBE(_n) # define DT1(_n,_a,_b) DTRACE_PROBE1(_n,_a,_b) # define DT2(_n,_a,_b,_c,_d) DTRACE_PROBE2(_n,_a,_b,_c,_d) # define DT3(_n,_a,_b,_c,_d,_e,_f) \ DTRACE_PROBE3(_n,_a,_b,_c,_d,_e,_f) # define DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) \ DTRACE_PROBE4(_n,_a,_b,_c,_d,_e,_f,_g,_h) # else # define DT(_n) # define DT1(_n,_a,_b) # define DT2(_n,_a,_b,_c,_d) # define DT3(_n,_a,_b,_c,_d,_e,_f) # define DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) # endif #else # define DT(_n) # define DT1(_n,_a,_b) # define DT2(_n,_a,_b,_c,_d) # define DT3(_n,_a,_b,_c,_d,_e,_f) # define DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) #endif struct ip6_routing { u_char ip6r_nxt; /* next header */ u_char ip6r_len; /* length in units of 8 octets */ u_char ip6r_type; /* always zero */ u_char ip6r_segleft; /* segments left */ u_32_t ip6r_reserved; /* reserved field */ }; #endif /* __IP_COMPAT_H__ */ Index: head/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c =================================================================== --- head/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c (revision 305823) +++ head/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c (revision 305824) @@ -1,1484 +1,1484 @@ /* $FreeBSD$ */ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. 
*/ #if !defined(lint) static const char sccsid[] = "@(#)ip_fil.c 2.41 6/5/96 (C) 1993-2000 Darren Reed"; static const char rcsid[] = "@(#)$Id$"; #endif #if defined(KERNEL) || defined(_KERNEL) # undef KERNEL # undef _KERNEL # define KERNEL 1 # define _KERNEL 1 #endif #if defined(__FreeBSD_version) && (__FreeBSD_version >= 400000) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_inet6.h" #endif #if defined(__FreeBSD_version) && (__FreeBSD_version >= 440000) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_random_ip_id.h" #endif #include #include #include #include # include # include #include #include # include #if defined(__FreeBSD_version) && (__FreeBSD_version >= 800000) #include #endif # include # include # include #if !defined(__hpux) # include #endif #include # include # include #include # include # include #include #include #include #include #include #include #include #include #if defined(__FreeBSD_version) && (__FreeBSD_version >= 800000) #include #else #define CURVNET_SET(arg) #define CURVNET_RESTORE() #define VNET_DEFINE(_t, _v) _t _v #define VNET_DECLARE(_t, _v) extern _t _v #define VNET(arg) arg #endif #if defined(__osf__) # include #endif #include #include #include #include "netinet/ip_compat.h" #ifdef USE_INET6 # include #endif #include "netinet/ip_fil.h" #include "netinet/ip_nat.h" #include "netinet/ip_frag.h" #include "netinet/ip_state.h" #include "netinet/ip_proxy.h" #include "netinet/ip_auth.h" #include "netinet/ip_sync.h" #include "netinet/ip_lookup.h" #include "netinet/ip_dstlist.h" #ifdef IPFILTER_SCAN #include "netinet/ip_scan.h" #endif #include "netinet/ip_pool.h" # include #include #ifdef CSUM_DATA_VALID #include #endif extern int ip_optcopy __P((struct ip *, struct ip *)); # ifdef IPFILTER_M_IPFILTER MALLOC_DEFINE(M_IPFILTER, "ipfilter", "IP Filter packet filter data structures"); # endif static int ipf_send_ip __P((fr_info_t *, mb_t *)); static void ipf_timer_func __P((void *arg)); VNET_DEFINE(ipf_main_softc_t, ipfmain) = { .ipf_running = -2, }; #define V_ipfmain VNET(ipfmain) # include # if defined(NETBSD_PF) # include # endif /* NETBSD_PF */ static eventhandler_tag ipf_arrivetag, ipf_departtag; #if 0 /* * Disable the "cloner" event handler; we are getting interface * events before the firewall is fully initialized and also have no vnet * information, thus leading to uninitialised memory accesses. * In addition it is unclear why we need it in the first place. * If it turns out to be needed, we'll need a dedicated event handler * for it to deal with the ifc and the correct vnet. 
*/ static eventhandler_tag ipf_clonetag; #endif static void ipf_ifevent(void *arg, struct ifnet *ifp); static void ipf_ifevent(arg, ifp) void *arg; struct ifnet *ifp; { CURVNET_SET(ifp->if_vnet); if (V_ipfmain.ipf_running > 0) ipf_sync(&V_ipfmain, NULL); CURVNET_RESTORE(); } static int ipf_check_wrapper(void *arg, struct mbuf **mp, struct ifnet *ifp, int dir) { struct ip *ip = mtod(*mp, struct ip *); int rv; /* * IPFilter expects everything in network byte order */ #if (__FreeBSD_version < 1000019) ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); #endif CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, ip, ip->ip_hl << 2, ifp, (dir == PFIL_OUT), mp); CURVNET_RESTORE(); #if (__FreeBSD_version < 1000019) if ((rv == 0) && (*mp != NULL)) { ip = mtod(*mp, struct ip *); ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); } #endif return rv; } # ifdef USE_INET6 # include static int ipf_check_wrapper6(void *arg, struct mbuf **mp, struct ifnet *ifp, int dir) { int error; CURVNET_SET(ifp->if_vnet); error = ipf_check(&V_ipfmain, mtod(*mp, struct ip *), sizeof(struct ip6_hdr), ifp, (dir == PFIL_OUT), mp); CURVNET_RESTORE(); return (error); } # endif #if defined(IPFILTER_LKM) int ipf_identify(s) char *s; { if (strcmp(s, "ipl") == 0) return 1; return 0; } #endif /* IPFILTER_LKM */ static void ipf_timer_func(arg) void *arg; { ipf_main_softc_t *softc = arg; SPL_INT(s); SPL_NET(s); READ_ENTER(&softc->ipf_global); if (softc->ipf_running > 0) ipf_slowtimer(softc); if (softc->ipf_running == -1 || softc->ipf_running == 1) { #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, hz/2); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); } RWLOCK_EXIT(&softc->ipf_global); SPL_X(s); } int ipfattach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif SPL_NET(s); if (softc->ipf_running > 0) { SPL_X(s); return EBUSY; } if (ipf_init_all(softc) < 0) { SPL_X(s); return EIO; } bzero((char *)V_ipfmain.ipf_selwait, sizeof(V_ipfmain.ipf_selwait)); softc->ipf_running = 1; if (softc->ipf_control_forwarding & 1) V_ipforwarding = 1; SPL_X(s); #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); return 0; } /* * Disable the filter by removing the hooks from the IP input/output * stream. */ int ipfdetach(softc) ipf_main_softc_t *softc; { #ifdef USE_SPL int s; #endif if (softc->ipf_control_forwarding & 2) V_ipforwarding = 0; SPL_NET(s); #if 0 if (softc->ipf_slow_ch.callout != NULL) untimeout(ipf_timer_func, softc, softc->ipf_slow_ch); bzero(&softc->ipf_slow, sizeof(softc->ipf_slow)); #endif callout_drain(&softc->ipf_slow_ch); ipf_fini_all(softc); softc->ipf_running = -2; SPL_X(s); return 0; } /* * Filter ioctl interface. 
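 *
 * For illustration, a userland consumer reaches this entry point by
 * opening one of the ipf devices and issuing an ioctl. A minimal,
 * hedged sketch (SIOCGETFS, ipfobj_t and struct friostat as declared
 * in the ipfilter headers; error handling elided):
 *
 *	ipfobj_t obj;
 *	struct friostat fio;
 *	int fd = open("/dev/ipl", O_RDONLY);
 *
 *	bzero(&obj, sizeof(obj));
 *	obj.ipfo_rev = IPFILTER_VERSION;
 *	obj.ipfo_type = IPFOBJ_IPFSTAT;
 *	obj.ipfo_size = sizeof(fio);
 *	obj.ipfo_ptr = &fio;
 *	if (fd >= 0 && ioctl(fd, SIOCGETFS, &obj) == 0)
 *		... fio now holds the filter statistics ...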
*/ int ipfioctl(dev, cmd, data, mode , p) struct thread *p; # define p_cred td_ucred # define p_uid td_ucred->cr_ruid struct cdev *dev; ioctlcmd_t cmd; caddr_t data; int mode; { int error = 0, unit = 0; SPL_INT(s); CURVNET_SET(TD_TO_VNET(p)); #if (BSD >= 199306) if (securelevel_ge(p->p_cred, 3) && (mode & FWRITE)) { V_ipfmain.ipf_interror = 130001; CURVNET_RESTORE(); return EPERM; } #endif unit = GET_MINOR(dev); if ((IPL_LOGMAX < unit) || (unit < 0)) { V_ipfmain.ipf_interror = 130002; CURVNET_RESTORE(); return ENXIO; } if (V_ipfmain.ipf_running <= 0) { if (unit != IPL_LOGIPF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130003; CURVNET_RESTORE(); return EIO; } if (cmd != SIOCIPFGETNEXT && cmd != SIOCIPFGET && cmd != SIOCIPFSET && cmd != SIOCFRENB && cmd != SIOCGETFS && cmd != SIOCGETFF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130004; CURVNET_RESTORE(); return EIO; } } SPL_NET(s); error = ipf_ioctlswitch(&V_ipfmain, unit, data, cmd, mode, p->p_uid, p); CURVNET_RESTORE(); if (error != -1) { SPL_X(s); return error; } SPL_X(s); return error; } /* * ipf_send_reset - this could conceivably be a call to tcp_respond(), but that * requires a large amount of setting up and isn't any more efficient. */ int ipf_send_reset(fin) fr_info_t *fin; { struct tcphdr *tcp, *tcp2; int tlen = 0, hlen; struct mbuf *m; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip; tcp = fin->fin_dp; if (tcp->th_flags & TH_RST) return -1; /* feedback loop */ if (ipf_checkl4sum(fin) == -1) return -1; tlen = fin->fin_dlen - (TCP_OFF(tcp) << 2) + ((tcp->th_flags & TH_SYN) ? 1 : 0) + ((tcp->th_flags & TH_FIN) ? 1 : 0); #ifdef USE_INET6 hlen = (fin->fin_v == 6) ? sizeof(ip6_t) : sizeof(ip_t); #else hlen = sizeof(ip_t); #endif #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; if (sizeof(*tcp2) + hlen > MLEN) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } } m->m_len = sizeof(*tcp2) + hlen; #if (BSD >= 199103) m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = (struct ifnet *)0; #endif ip = mtod(m, struct ip *); bzero((char *)ip, hlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; #endif tcp2 = (struct tcphdr *)((char *)ip + hlen); tcp2->th_sport = tcp->th_dport; tcp2->th_dport = tcp->th_sport; if (tcp->th_flags & TH_ACK) { tcp2->th_seq = tcp->th_ack; tcp2->th_flags = TH_RST; tcp2->th_ack = 0; } else { tcp2->th_seq = 0; tcp2->th_ack = ntohl(tcp->th_seq); tcp2->th_ack += tlen; tcp2->th_ack = htonl(tcp2->th_ack); tcp2->th_flags = TH_RST|TH_ACK; } TCP_X2_A(tcp2, 0); TCP_OFF_A(tcp2, sizeof(*tcp2) >> 2); tcp2->th_win = tcp->th_win; tcp2->th_sum = 0; tcp2->th_urp = 0; #ifdef USE_INET6 if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = 0; ip6->ip6_src = fin->fin_dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; tcp2->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*ip6), sizeof(*tcp2)); return ipf_send_ip(fin, m); } #endif ip->ip_p = IPPROTO_TCP; ip->ip_len = htons(sizeof(struct tcphdr)); ip->ip_src.s_addr = fin->fin_daddr; ip->ip_dst.s_addr = fin->fin_saddr; tcp2->th_sum = in_cksum(m, hlen + sizeof(*tcp2)); ip->ip_len = htons(hlen + sizeof(*tcp2)); return ipf_send_ip(fin, m); } /* * ip_len must be in network byte order when called. 
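 *
 * A worked example of the sequence math in ipf_send_reset() above:
 * for an incoming SYN with th_seq == 1000 carrying no data, tlen is
 * 0 + 1 (SYN) + 0 (FIN) == 1, so the reply is RST|ACK with
 * th_ack == 1001. The ACK has to cover every sequence number the
 * offending segment consumed, including the SYN and FIN flags,
 * otherwise the peer will not treat the reset as acceptable.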
*/ static int ipf_send_ip(fin, m) fr_info_t *fin; mb_t *m; { fr_info_t fnew; ip_t *ip, *oip; int hlen; ip = mtod(m, ip_t *); bzero((char *)&fnew, sizeof(fnew)); fnew.fin_main_soft = fin->fin_main_soft; IP_V_A(ip, fin->fin_v); switch (fin->fin_v) { case 4 : oip = fin->fin_ip; hlen = sizeof(*oip); fnew.fin_v = 4; fnew.fin_p = ip->ip_p; fnew.fin_plen = ntohs(ip->ip_len); IP_HL_A(ip, sizeof(*oip) >> 2); ip->ip_tos = oip->ip_tos; ip->ip_id = fin->fin_ip->ip_id; #if defined(FreeBSD) && (__FreeBSD_version > 460000) ip->ip_off = htons(path_mtu_discovery ? IP_DF : 0); #else ip->ip_off = 0; #endif ip->ip_ttl = V_ip_defttl; ip->ip_sum = 0; break; #ifdef USE_INET6 case 6 : { ip6_t *ip6 = (ip6_t *)ip; ip6->ip6_vfc = 0x60; ip6->ip6_hlim = IPDEFTTL; hlen = sizeof(*ip6); fnew.fin_p = ip6->ip6_nxt; fnew.fin_v = 6; fnew.fin_plen = ntohs(ip6->ip6_plen) + hlen; break; } #endif default : return EINVAL; } #ifdef IPSEC m->m_pkthdr.rcvif = NULL; #endif fnew.fin_ifp = fin->fin_ifp; fnew.fin_flx = FI_NOCKSUM; fnew.fin_m = m; fnew.fin_ip = ip; fnew.fin_mp = &m; fnew.fin_hlen = hlen; fnew.fin_dp = (char *)ip + hlen; (void) ipf_makefrip(hlen, ip, &fnew); return ipf_fastroute(m, &m, &fnew, NULL); } int ipf_send_icmp_err(type, fin, dst) int type; fr_info_t *fin; int dst; { int err, hlen, xtra, iclen, ohlen, avail, code; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip, *ip2; if ((type < 0) || (type >= ICMP_MAXTYPE)) return -1; code = fin->fin_icode; #ifdef USE_INET6 #if 0 /* XXX Fix an off by one error: s/>/>=/ was: if ((code < 0) || (code > sizeof(icmptoicmp6unreach)/sizeof(int))) Fix obtained from NetBSD ip_fil_netbsd.c r1.4: */ #endif if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return -1; #endif if (ipf_checkl4sum(fin) == -1) return -1; #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return -1; avail = MHLEN; xtra = 0; hlen = 0; ohlen = 0; dst4.s_addr = 0; ifp = fin->fin_ifp; if (fin->fin_v == 4) { if ((fin->fin_p == IPPROTO_ICMP) && !(fin->fin_flx & FI_SHORT)) switch (ntohs(fin->fin_data[0]) >> 8) { case ICMP_ECHO : case ICMP_TSTAMP : case ICMP_IREQ : case ICMP_MASKREQ : break; default : FREE_MB_T(m); return 0; } if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 4, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } dst4 = dst6.in4; } else dst4.s_addr = fin->fin_daddr; hlen = sizeof(ip_t); ohlen = fin->fin_hlen; iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; if (fin->fin_hlen < fin->fin_plen) xtra = MIN(fin->fin_dlen, 8); else xtra = 0; } #ifdef USE_INET6 else if (fin->fin_v == 6) { hlen = sizeof(ip6_t); ohlen = sizeof(ip6_t); iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; type = icmptoicmp6types[type]; if (type == ICMP6_DST_UNREACH) code = icmptoicmp6unreach[code]; if (iclen + max_linkhdr + fin->fin_plen > avail) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return -1; } avail = MCLBYTES; } xtra = MIN(fin->fin_plen, avail - iclen - max_linkhdr); xtra = MIN(xtra, IPV6_MMTU - iclen); if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 6, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return -1; } } else dst6 = fin->fin_dst6; } #endif else { FREE_MB_T(m); return -1; } avail -= (max_linkhdr + iclen); if (avail < 0) { FREE_MB_T(m); return -1; } if (xtra > avail) xtra = avail; iclen += xtra; m->m_data += max_linkhdr; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = iclen; m->m_len = iclen; ip = mtod(m, ip_t *); icmp = (struct icmp 
*)((char *)ip + hlen); ip2 = (ip_t *)&icmp->icmp_ip; icmp->icmp_type = type; icmp->icmp_code = fin->fin_icode; icmp->icmp_cksum = 0; #ifdef icmp_nextmtu if (type == ICMP_UNREACH && fin->fin_icode == ICMP_UNREACH_NEEDFRAG) { if (fin->fin_mtu != 0) { icmp->icmp_nextmtu = htons(fin->fin_mtu); } else if (ifp != NULL) { icmp->icmp_nextmtu = htons(GETIFMTU_4(ifp)); } else { /* make up a number... */ icmp->icmp_nextmtu = htons(fin->fin_plen - 20); } } #endif bcopy((char *)fin->fin_ip, (char *)ip2, ohlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(iclen - hlen); ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 0; ip6->ip6_src = dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), iclen - hlen); } else #endif { ip->ip_p = IPPROTO_ICMP; ip->ip_src.s_addr = dst4.s_addr; ip->ip_dst.s_addr = fin->fin_saddr; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = ipf_cksum((u_short *)icmp, sizeof(*icmp) + 8); ip->ip_len = htons(iclen); ip->ip_p = IPPROTO_ICMP; } err = ipf_send_ip(fin, m); return err; } /* * m0 - pointer to mbuf where the IP packet starts * mpp - pointer to the mbuf pointer that is the start of the mbuf chain */ int ipf_fastroute(m0, mpp, fin, fdp) mb_t *m0, **mpp; fr_info_t *fin; frdest_t *fdp; { register struct ip *ip, *mhip; register struct mbuf *m = *mpp; int len, off, error = 0, hlen, code; struct ifnet *ifp, *sifp; struct sockaddr_in dst; struct nhop4_extended nh4; int has_nhop = 0; u_long fibnum = 0; u_short ip_off; frdest_t node; frentry_t *fr; #ifdef M_WRITABLE /* * HOT FIX/KLUDGE: * * If the mbuf we're about to send is not writable (because of * a cluster reference, for example) we'll need to make a copy * of it since this routine modifies the contents. * * If you have non-crappy network hardware that can transmit data * from the mbuf, rather than making a copy, this is gonna be a * problem. */ if (M_WRITABLE(m) == 0) { m0 = m_dup(m, M_NOWAIT); if (m0 != NULL) { FREE_MB_T(m); m = m0; *mpp = m; } else { error = ENOBUFS; FREE_MB_T(m); goto done; } } #endif #ifdef USE_INET6 if (fin->fin_v == 6) { /* * currently "to " and "to :ip#" are not supported * for IPv6 */ return ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif hlen = fin->fin_hlen; ip = mtod(m0, struct ip *); ifp = NULL; /* * Route packet. */ bzero(&dst, sizeof (dst)); dst.sin_family = AF_INET; dst.sin_addr = ip->ip_dst; dst.sin_len = sizeof(dst); fr = fin->fin_fr; if ((fr != NULL) && !(fr->fr_flags & FR_KEEPSTATE) && (fdp != NULL) && (fdp->fd_type == FRD_DSTLIST)) { if (ipf_dstlist_select_node(fin, fdp->fd_ptr, NULL, &node) == 0) fdp = &node; } if (fdp != NULL) ifp = fdp->fd_ptr; else ifp = fin->fin_ifp; if ((ifp == NULL) && ((fr == NULL) || !(fr->fr_flags & FR_FASTROUTE))) { error = -2; goto bad; } if ((fdp != NULL) && (fdp->fd_ip.s_addr != 0)) dst.sin_addr = fdp->fd_ip; fibnum = M_GETFIB(m0); if (fib4_lookup_nh_ext(fibnum, dst.sin_addr, NHR_REF, 0, &nh4) != 0) { if (in_localaddr(ip->ip_dst)) error = EHOSTUNREACH; else error = ENETUNREACH; goto bad; } has_nhop = 1; if (ifp == NULL) ifp = nh4.nh_ifp; if (nh4.nh_flags & NHF_GATEWAY) dst.sin_addr = nh4.nh_addr; /* * For input packets which are being "fastrouted", they won't * go back through output filtering and miss their chance to get * NAT'd and counted. 
Duplicated packets aren't considered to be * part of the normal packet stream, so do not NAT them or pass * them through stateful checking, etc. */ if ((fdp != &fr->fr_dif) && (fin->fin_out == 0)) { sifp = fin->fin_ifp; fin->fin_ifp = ifp; fin->fin_out = 1; (void) ipf_acctpkt(fin, NULL); fin->fin_fr = NULL; if (!fr || !(fr->fr_flags & FR_RETMASK)) { u_32_t pass; (void) ipf_state_check(fin, &pass); } switch (ipf_nat_checkout(fin, NULL)) { case 0 : break; case 1 : ip->ip_sum = 0; break; case -1 : error = -1; goto bad; break; } fin->fin_ifp = sifp; fin->fin_out = 0; } else ip->ip_sum = 0; /* * If small enough for interface, can just send directly. */ if (ntohs(ip->ip_len) <= ifp->if_mtu) { if (!ip->ip_sum) ip->ip_sum = in_cksum(m, hlen); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)&dst, NULL ); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { error = EMSGSIZE; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } { int mhlen, firstlen = len; struct mbuf **mnext = &m->m_act; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) { m = m0; error = ENOBUFS; goto bad; } m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); bcopy((char *)ip, (char *)mhip, sizeof(*ip)); if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); IP_HL_A(mhip, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ntohs(ip->ip_len)) len = ntohs(ip->ip_len) - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); *mnext = m; - m->m_next = m_copy(m0, off, len); + m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == 0) { error = ENOBUFS; /* ??? */ goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; mhip->ip_off = htons((u_short)mhip->ip_off); mhip->ip_sum = 0; mhip->ip_sum = in_cksum(m, mhlen); mnext = &m->m_act; } /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). 
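	 *
	 * A worked example of the arithmetic used here, assuming a
	 * 1500 byte MTU and a 20 byte header on a 4000 byte packet:
	 * len = (1500 - 20) & ~7 = 1480, so the fragments carry bytes
	 * 20-1499, 1500-2979 and 2980-3999 of the original packet,
	 * with fragment offsets of 0, 185 and 370 respectively
	 * (ip_off counts in units of 8 octets) and IP_MF set on
	 * every fragment except the last.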
*/ m_adj(m0, hlen + firstlen - ip->ip_len); ip->ip_len = htons((u_short)(hlen + firstlen)); ip->ip_off = htons((u_short)IP_MF); ip->ip_sum = 0; ip->ip_sum = in_cksum(m0, hlen); sendorfree: for (m = m0; m; m = m0) { m0 = m->m_act; m->m_act = 0; if (error == 0) error = (*ifp->if_output)(ifp, m, (struct sockaddr *)&dst, NULL ); else FREE_MB_T(m); } } done: if (!error) V_ipfmain.ipf_frouteok[0]++; else V_ipfmain.ipf_frouteok[1]++; if (has_nhop) fib4_free_nh_ext(fibnum, &nh4); return 0; bad: if (error == EMSGSIZE) { sifp = fin->fin_ifp; code = fin->fin_icode; fin->fin_icode = ICMP_UNREACH_NEEDFRAG; fin->fin_ifp = ifp; (void) ipf_send_icmp_err(ICMP_UNREACH, fin, 1); fin->fin_ifp = sifp; fin->fin_icode = code; } FREE_MB_T(m); goto done; } int ipf_verifysrc(fin) fr_info_t *fin; { struct nhop4_basic nh4; if (fib4_lookup_nh_basic(0, fin->fin_src, 0, 0, &nh4) != 0) return (0); return (fin->fin_ifp == nh4.nh_ifp); } /* * return the first IP Address associated with an interface */ int ipf_ifpaddr(softc, v, atype, ifptr, inp, inpmask) ipf_main_softc_t *softc; int v, atype; void *ifptr; i6addr_t *inp, *inpmask; { #ifdef USE_INET6 struct in6_addr *inp6 = NULL; #endif struct sockaddr *sock, *mask; struct sockaddr_in *sin; struct ifaddr *ifa; struct ifnet *ifp; if ((ifptr == NULL) || (ifptr == (void *)-1)) return -1; sin = NULL; ifp = ifptr; if (v == 4) inp->in4.s_addr = 0; #ifdef USE_INET6 else if (v == 6) bzero((char *)inp, sizeof(*inp)); #endif ifa = TAILQ_FIRST(&ifp->if_addrhead); sock = ifa->ifa_addr; while (sock != NULL && ifa != NULL) { sin = (struct sockaddr_in *)sock; if ((v == 4) && (sin->sin_family == AF_INET)) break; #ifdef USE_INET6 if ((v == 6) && (sin->sin_family == AF_INET6)) { inp6 = &((struct sockaddr_in6 *)sin)->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(inp6) && !IN6_IS_ADDR_LOOPBACK(inp6)) break; } #endif ifa = TAILQ_NEXT(ifa, ifa_link); if (ifa != NULL) sock = ifa->ifa_addr; } if (ifa == NULL || sin == NULL) return -1; mask = ifa->ifa_netmask; if (atype == FRI_BROADCAST) sock = ifa->ifa_broadaddr; else if (atype == FRI_PEERADDR) sock = ifa->ifa_dstaddr; if (sock == NULL) return -1; #ifdef USE_INET6 if (v == 6) { return ipf_ifpfillv6addr(atype, (struct sockaddr_in6 *)sock, (struct sockaddr_in6 *)mask, inp, inpmask); } #endif return ipf_ifpfillv4addr(atype, (struct sockaddr_in *)sock, (struct sockaddr_in *)mask, &inp->in4, &inpmask->in4); } u_32_t ipf_newisn(fin) fr_info_t *fin; { u_32_t newiss; newiss = arc4random(); return newiss; } INLINE int ipf_checkv4sum(fin) fr_info_t *fin; { #ifdef CSUM_DATA_VALID int manual = 0; u_short sum; ip_t *ip; mb_t *m; if ((fin->fin_flx & FI_NOCKSUM) != 0) return 0; if ((fin->fin_flx & FI_SHORT) != 0) return 1; if (fin->fin_cksum != FI_CK_NEEDED) return (fin->fin_cksum > FI_CK_NEEDED) ? 
0 : -1; m = fin->fin_m; if (m == NULL) { manual = 1; goto skipauto; } ip = fin->fin_ip; if ((m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)) == CSUM_IP_CHECKED) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_csum_ip_checked, fr_info_t *, fin, u_int, m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)); return -1; } if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { /* Depending on the driver, UDP may have zero checksum */ if (fin->fin_p == IPPROTO_UDP && (fin->fin_flx & (FI_FRAG|FI_SHORT|FI_BAD)) == 0) { udphdr_t *udp = fin->fin_dp; if (udp->uh_sum == 0) { /* * we're good no matter what the hardware * checksum flags and csum_data say (handling * of csum_data for zero UDP checksum is not * consistent across all drivers) */ fin->fin_cksum = 1; return 0; } } if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) sum = m->m_pkthdr.csum_data; else sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + fin->fin_dlen + fin->fin_p)); sum ^= 0xffff; if (sum != 0) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_sum, fr_info_t *, fin, u_int, sum); } else { fin->fin_cksum = FI_CK_SUMOK; return 0; } } else { if (m->m_pkthdr.csum_flags == CSUM_DELAY_DATA) { fin->fin_cksum = FI_CK_L4FULL; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_TCP || m->m_pkthdr.csum_flags == CSUM_UDP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else if (m->m_pkthdr.csum_flags == CSUM_IP) { fin->fin_cksum = FI_CK_L4PART; return 0; } else { manual = 1; } } skipauto: if (manual != 0) { if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_manual, fr_info_t *, fin, u_int, manual); return -1; } } #else if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } #endif return 0; } #ifdef USE_INET6 INLINE int ipf_checkv6sum(fin) fr_info_t *fin; { if ((fin->fin_flx & FI_NOCKSUM) != 0) { DT(ipf_checkv6sum_fi_nocksum); return 0; } if ((fin->fin_flx & FI_SHORT) != 0) { DT(ipf_checkv6sum_fi_short); return 1; } if (fin->fin_cksum != FI_CK_NEEDED) { DT(ipf_checkv6sum_fi_ck_needed); return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; } if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv6sum_checkl4sum, fr_info_t *, fin, u_int, -1); return -1; } return 0; } #endif /* USE_INET6 */ size_t mbufchainlen(m0) struct mbuf *m0; { size_t len; if ((m0->m_flags & M_PKTHDR) != 0) { len = m0->m_pkthdr.len; } else { struct mbuf *m; for (m = m0, len = 0; m != NULL; m = m->m_next) len += m->m_len; } return len; } /* ------------------------------------------------------------------------ */ /* Function: ipf_pullup */ /* Returns: NULL == pullup failed, else pointer to protocol header */ /* Parameters: xmin(I)- pointer to buffer where data packet starts */ /* fin(I) - pointer to packet information */ /* len(I) - number of bytes to pullup */ /* */ /* Attempt to move at least len bytes (from the start of the buffer) into a */ /* single buffer for ease of access. Operating system native functions are */ /* used to manage buffers - if necessary. If the entire packet ends up in */ /* a single buffer, set the FI_COALESCE flag even though ipf_coalesce() has */ /* not been called. Both fin_ip and fin_dp are updated before exiting _IF_ */ /* and ONLY if the pullup succeeds. */ /* */ /* We assume that 'xmin' is a pointer to a buffer that is part of the chain */ /* of buffers that starts at *fin->fin_mp. 
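A caller-side sketch may help (illustrative only, using just the names */
/* documented above; 'plen' stands for whatever length the caller needs):  */
/*                                                                          */
/*	if (ipf_pullup(fin->fin_m, fin, fin->fin_hlen + plen) == NULL)     */
/*		return -1;	.. chain freed, *fin->fin_mp is now NULL    */
/*                                                                          */
/* After a successful call, any pointer previously derived from fin_ip or  */
/* fin_dp must be re-loaded, since the data may have moved.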
*/ /* ------------------------------------------------------------------------ */ void * ipf_pullup(xmin, fin, len) mb_t *xmin; fr_info_t *fin; int len; { int dpoff, ipoff; mb_t *m = xmin; char *ip; if (m == NULL) return NULL; ip = (char *)fin->fin_ip; if ((fin->fin_flx & FI_COALESCE) != 0) return ip; ipoff = fin->fin_ipoff; if (fin->fin_dp != NULL) dpoff = (char *)fin->fin_dp - (char *)ip; else dpoff = 0; if (M_LEN(m) < len) { mb_t *n = *fin->fin_mp; /* * Assume that M_PKTHDR is set and just work with what is left * rather than check.. * Should not make any real difference, anyway. */ if (m != n) { /* * Record the mbuf that points to the mbuf that we're * about to go to work on so that we can update the * m_next appropriately later. */ for (; n->m_next != m; n = n->m_next) ; } else { n = NULL; } #ifdef MHLEN if (len > MHLEN) #else if (len > MLEN) #endif { #ifdef HAVE_M_PULLDOWN if (m_pulldown(m, 0, len, NULL) == NULL) m = NULL; #else FREE_MB_T(*fin->fin_mp); m = NULL; n = NULL; #endif } else { m = m_pullup(m, len); } if (n != NULL) n->m_next = m; if (m == NULL) { /* * When n is non-NULL, it indicates that m pointed to * a sub-chain (tail) of the mbuf and that the head * of this chain has not yet been free'd. */ if (n != NULL) { FREE_MB_T(*fin->fin_mp); } *fin->fin_mp = NULL; fin->fin_m = NULL; return NULL; } if (n == NULL) *fin->fin_mp = m; while (M_LEN(m) == 0) { m = m->m_next; } fin->fin_m = m; ip = MTOD(m, char *) + ipoff; fin->fin_ip = (ip_t *)ip; if (fin->fin_dp != NULL) fin->fin_dp = (char *)fin->fin_ip + dpoff; if (fin->fin_fraghdr != NULL) fin->fin_fraghdr = (char *)ip + ((char *)fin->fin_fraghdr - (char *)fin->fin_ip); } if (len == fin->fin_plen) fin->fin_flx |= FI_COALESCE; return ip; } int ipf_inject(fin, m) fr_info_t *fin; mb_t *m; { int error = 0; if (fin->fin_out == 0) { netisr_dispatch(NETISR_IP, m); } else { fin->fin_ip->ip_len = ntohs(fin->fin_ip->ip_len); fin->fin_ip->ip_off = ntohs(fin->fin_ip->ip_off); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); } return error; } int ipf_pfil_unhook(void) { #if defined(NETBSD_PF) && (__FreeBSD_version >= 500011) struct pfil_head *ph_inet; # ifdef USE_INET6 struct pfil_head *ph_inet6; # endif #endif #ifdef NETBSD_PF ph_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); if (ph_inet != NULL) pfil_remove_hook((void *)ipf_check_wrapper, NULL, PFIL_IN|PFIL_OUT|PFIL_WAITOK, ph_inet); # ifdef USE_INET6 ph_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); if (ph_inet6 != NULL) pfil_remove_hook((void *)ipf_check_wrapper6, NULL, PFIL_IN|PFIL_OUT|PFIL_WAITOK, ph_inet6); # endif #endif return (0); } int ipf_pfil_hook(void) { #if defined(NETBSD_PF) && (__FreeBSD_version >= 500011) struct pfil_head *ph_inet; # ifdef USE_INET6 struct pfil_head *ph_inet6; # endif #endif # ifdef NETBSD_PF ph_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); # ifdef USE_INET6 ph_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); # endif if (ph_inet == NULL # ifdef USE_INET6 && ph_inet6 == NULL # endif ) { return ENODEV; } if (ph_inet != NULL) pfil_add_hook((void *)ipf_check_wrapper, NULL, PFIL_IN|PFIL_OUT|PFIL_WAITOK, ph_inet); # ifdef USE_INET6 if (ph_inet6 != NULL) pfil_add_hook((void *)ipf_check_wrapper6, NULL, PFIL_IN|PFIL_OUT|PFIL_WAITOK, ph_inet6); # endif # endif return (0); } void ipf_event_reg(void) { ipf_arrivetag = EVENTHANDLER_REGISTER(ifnet_arrival_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); ipf_departtag = EVENTHANDLER_REGISTER(ifnet_departure_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); #if 0 ipf_clonetag = 
EVENTHANDLER_REGISTER(if_clone_event, ipf_ifevent, \ NULL, EVENTHANDLER_PRI_ANY); #endif } void ipf_event_dereg(void) { if (ipf_arrivetag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ipf_arrivetag); } if (ipf_departtag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, ipf_departtag); } #if 0 if (ipf_clonetag != NULL) { EVENTHANDLER_DEREGISTER(if_clone_event, ipf_clonetag); } #endif } u_32_t ipf_random() { return arc4random(); } u_int ipf_pcksum(fin, hlen, sum) fr_info_t *fin; int hlen; u_int sum; { struct mbuf *m; u_int sum2; int off; m = fin->fin_m; off = (char *)fin->fin_dp - (char *)fin->fin_ip; m->m_data += hlen; m->m_len -= hlen; sum2 = in_cksum(fin->fin_m, fin->fin_plen - off); m->m_len += hlen; m->m_data -= hlen; /* * Both sum and sum2 are partial sums, so combine them together. */ sum += ~sum2 & 0xffff; while (sum > 0xffff) sum = (sum & 0xffff) + (sum >> 16); sum2 = ~sum & 0xffff; return sum2; } Index: head/sys/kern/uipc_socket.c =================================================================== --- head/sys/kern/uipc_socket.c (revision 305823) +++ head/sys/kern/uipc_socket.c (revision 305824) @@ -1,3727 +1,3728 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets up socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. 
If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. 
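 *
 * For illustration, the consumer-facing life cycle described above
 * reduces to the following sketch (error handling elided; socreate()
 * and soclose() as defined later in this file):
 *
 *	struct socket *so;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, 0, cred, td);
 *	if (error == 0) {
 *		... sobind() / soconnect() / sosend() / soreceive() ...
 *		error = soclose(so);
 *	}
 *
 * socreate() hands back the single consumer reference and soclose()
 * consumes it; sofree() then completes the teardown once any named
 * references (SS_PROTOREF, queue membership) have also drained.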
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_solisten(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops solisten_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_solisten, }; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. 
*/ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. 
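 *
 * Note that the allocations below deliberately use M_NOWAIT: soalloc()
 * is also reached via sonewconn() from protocol receive paths where
 * sleeping is not permitted, so a transient shortage simply fails the
 * allocation and the caller drops the new connection instead.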
*/ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); /* remove accept filter if one is present. */ if (so->so_accf != NULL) do_setopt_accept_filter(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. 
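 * The distinction between the errors returned here matters to
		 * userland: EAFNOSUPPORT means the whole domain is absent,
		 * EPROTOTYPE that the protocol exists but does not match
		 * the requested socket type, and EPROTONOSUPPORT that the
		 * specific protocol number is not known in this domain.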
*/ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); so->so_count = 1; /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); so->so_count = 0; sodealloc(so); return (error); } *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. 
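 *
 * The queue limit test below allows some headroom beyond the
 * configured backlog: with a listen(2) backlog of 128 (so_qlimit ==
 * 128), for example, new connections are refused once so_qlen
 * exceeds 3 * 128 / 2 == 192. This is also why sysctl_somaxconn()
 * above caps the tunable at UINT_MAX / 3, so that this very
 * multiplication cannot overflow.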
*/ struct socket * sonewconn(struct socket *head, int connstatus) { static struct timeval lastover; static struct timeval overinterval = { 60, 0 }; static int overcount; struct socket *so; int over; ACCEPT_LOCK(); over = (head->so_qlen > 3 * head->so_qlimit / 2); ACCEPT_UNLOCK(); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif overcount++; if (ratecheck(&lastover, &overinterval)) { log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, head->so_qlen, overcount); overcount = 0; } return (NULL); } VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } if ((head->so_options & SO_ACCEPTFILTER) != 0) connstatus = 0; so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; so->so_state |= connstatus; ACCEPT_LOCK(); /* * The accept socket may be tearing down but we just * won a race on the ACCEPT_LOCK. * However, if sctp_peeloff() is called on a 1-to-many * style socket, the SO_ACCEPTCONN doesn't need to be set. */ if (!(head->so_options & SO_ACCEPTCONN) && ((head->so_proto->pr_protocol != IPPROTO_SCTP) || (head->so_type != SOCK_SEQPACKET))) { SOCK_LOCK(so); so->so_head = NULL; sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ return (NULL); } if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); so->so_qstate |= SQ_COMP; head->so_qlen++; } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. 
*/ while (head->so_incqlen > head->so_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->so_incomp); TAILQ_REMOVE(&head->so_incomp, sp, so_list); head->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; head->so_incqlen++; } ACCEPT_UNLOCK(); if (connstatus) { sorwakeup(head); wakeup_one(&head->so_timeo); } return (so); } int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { SOCK_LOCK_ASSERT(so); if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->so_qlimit = backlog; so->so_options |= SO_ACCEPTCONN; } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. 
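 *
 * Returning to solisten_proto() above: its clamp of the requested
 * backlog to somaxconn is visible from userland through the
 * FreeBSD-specific SO_LISTENQLIMIT option handled in sogetopt() later
 * in this file. A minimal sketch, error handling omitted and "s"
 * assumed to be a bound socket:
 *
 *	#include <sys/socket.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_qlimit(int s)
 *	{
 *		int qlimit;
 *		socklen_t len = sizeof(qlimit);
 *
 *		listen(s, INT_MAX);		// request an absurd backlog
 *		getsockopt(s, SOL_SOCKET, SO_LISTENQLIMIT, &qlimit, &len);
 *		printf("so_qlimit clamped to %d\n", qlimit);
 *	}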
*/ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; struct socket *head; ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { SOCK_UNLOCK(so); ACCEPT_UNLOCK(); return; } head = so->so_head; if (head != NULL) { KASSERT((so->so_qstate & SQ_COMP) != 0 || (so->so_qstate & SQ_INCOMP) != 0, ("sofree: so_head != NULL, but neither SQ_COMP nor " "SQ_INCOMP")); KASSERT((so->so_qstate & SQ_COMP) == 0 || (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; so->so_head = NULL; } KASSERT((so->so_qstate & SQ_COMP) == 0 && (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); if (so->so_options & SO_ACCEPTCONN) { KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated")); KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated")); } SOCK_UNLOCK(so); ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbrelease_internal() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols should not * assume they can perform socket wakeups, etc, in their detach code. */ sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); seldrain(&so->so_snd.sb_sel); seldrain(&so->so_rcv.sb_sel); knlist_destroy(&so->so_rcv.sb_sel.si_note); knlist_destroy(&so->so_snd.sb_sel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if (so->so_options & SO_LINGER) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); ACCEPT_LOCK(); if (so->so_options & SO_ACCEPTCONN) { struct socket *sp; /* * Prevent new additions to the accept queues due * to ACCEPT_LOCK races while we are draining them. 
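 *
 * The SO_LINGER tsleep() loop earlier in soclose() is what makes the
 * following userland sketch block in close() for up to five seconds
 * while unsent data drains (a hypothetical illustration, not part of
 * this file):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void
 *	close_lingering(int s)
 *	{
 *		struct linger l;
 *
 *		l.l_onoff = 1;
 *		l.l_linger = 5;		// seconds spent in the "soclos" sleep
 *		setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *		close(s);		// may block while the peer ACKs data
 *	}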
*/ so->so_options &= ~SO_ACCEPTCONN; while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { TAILQ_REMOVE(&so->so_incomp, sp, so_list); so->so_incqlen--; sp->so_qstate &= ~SQ_INCOMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { TAILQ_REMOVE(&so->so_comp, sp, so_list); so->so_qlen--; sp->so_qstate &= ~SQ_COMP; sp->so_head = NULL; ACCEPT_UNLOCK(); soabort(sp); ACCEPT_LOCK(); } KASSERT((TAILQ_EMPTY(&so->so_comp)), ("%s: so_comp populated", __func__)); KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("%s: so_incomp populated", __func__)); } SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. 
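 *
 * That reset of so_error is also the reason the classic non-blocking
 * connect() idiom reads the result exactly once via SO_ERROR (whose
 * getsockopt handler later in this file likewise clears it). A
 * minimal sketch; the destination address is assumed to be prepared
 * by the caller:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <poll.h>
 *
 *	static int
 *	nb_connect(int s, const struct sockaddr *sa, socklen_t salen)
 *	{
 *		struct pollfd pfd = { .fd = s, .events = POLLOUT };
 *		int err;
 *		socklen_t len = sizeof(err);
 *
 *		fcntl(s, F_SETFL, fcntl(s, F_GETFL) | O_NONBLOCK);
 *		if (connect(s, sa, salen) == 0)
 *			return (0);		// connected immediately
 *		if (errno != EINPROGRESS)
 *			return (-1);
 *		poll(&pfd, 1, -1);		// writable once soisconnected()
 *		getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
 *		errno = err;
 *		return (err == 0 ? 0 : -1);
 *	}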
*/ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. 
We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. 
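 *
 * For atomic protocols the sb_hiwat test below is what a userland
 * sender observes as EMSGSIZE; a minimal, hypothetical sketch using a
 * datagram deliberately larger than any default send buffer:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	static void
 *	oversized_send(int udp_fd, const struct sockaddr *dst, socklen_t dlen)
 *	{
 *		static char big[1024 * 1024];	// exceeds the send high-water
 *
 *		if (sendto(udp_fd, big, sizeof(big), 0, dst, dlen) == -1 &&
 *		    errno == EMSGSIZE)
 *			printf("datagram exceeds so_snd.sb_hiwat\n");
 *	}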
*/ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. 
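 *
 * A minimal userland sketch of this non-inline out-of-band path; it
 * assumes a connected TCP pair with SO_OOBINLINE left disabled:
 *
 *	#include <sys/socket.h>
 *
 *	static void
 *	oob_roundtrip(int snd_fd, int rcv_fd)
 *	{
 *		char c;
 *
 *		send(snd_fd, "!", 1, MSG_OOB);	// peer runs sohasoutofband()
 *		recv(rcv_fd, &c, 1, MSG_OOB);	// serviced by this routine
 *	}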
*/ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. 
MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m == NULL) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } else goto dontblock; } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). 
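 *
 * From userland these control mbufs surface as cmsgs from recvmsg().
 * A minimal sketch asking UDP for receive timestamps, which arrive as
 * one MT_CONTROL mbuf ahead of the data (error handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	static void
 *	recv_with_stamp(int udp_fd)
 *	{
 *		char data[2048], cbuf[CMSG_SPACE(sizeof(struct timeval))];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg;
 *		struct cmsghdr *cm;
 *		struct timeval tv;
 *		int on = 1;
 *
 *		setsockopt(udp_fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
 *		memset(&msg, 0, sizeof(msg));
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		msg.msg_control = cbuf;
 *		msg.msg_controllen = sizeof(cbuf);
 *		if (recvmsg(udp_fd, &msg, 0) == -1)
 *			return;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
 *		    cm = CMSG_NXTHDR(&msg, cm))
 *			if (cm->cmsg_level == SOL_SOCKET &&
 *			    cm->cmsg_type == SCM_TIMESTAMP)
 *				memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
 *		// tv now holds the kernel's receive timestamp
 *	}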
*/ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { if (flags & MSG_PEEK) { if (controlp != NULL) { - *controlp = m_copy(m, 0, m->m_len); + *controlp = m_copym(m, 0, m->m_len, + M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastrecord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. 
*/ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (m && pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while we were notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. 
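 *
 * The MSG_WAITALL loop above is what lets a stream reader request an
 * exact byte count in a single call; a short count then only means
 * EOF, a signal, or an error. Minimal sketch:
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *
 *	static ssize_t
 *	read_exact(int s, void *buf, size_t len)
 *	{
 *		// blocks until len bytes, EOF, a signal, or an error
 *		return (recv(s, buf, len, MSG_WAITALL));
 *	}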
*/ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. 
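 *
 * Seen from userland, the non-blocking branches above reduce to the
 * familiar EAGAIN loop; a hypothetical sketch:
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *	#include <poll.h>
 *
 *	static void
 *	drain_nonblock(int s, char *buf, size_t len)
 *	{
 *		struct pollfd pfd = { .fd = s, .events = POLLIN };
 *		ssize_t n;
 *
 *		for (;;) {
 *			n = recv(s, buf, len, MSG_DONTWAIT);
 *			if (n > 0)
 *				continue;	// consume and retry
 *			if (n == 0)
 *				break;		// SBS_CANTRCVMORE: peer closed
 *			if (errno == EAGAIN) {
 *				poll(&pfd, 1, -1);
 *				continue;
 *			}
 *			break;			// hard error
 *		}
 *	}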
*/ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. 
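 *
 * Because datagrams are received atomically, any tail that does not
 * fit the caller's buffer is dropped and MSG_TRUNC raised (see the
 * end of this function). A minimal userland sketch:
 *
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static void
 *	recv_check_trunc(int udp_fd)
 *	{
 *		char small[16];
 *		struct iovec iov = { .iov_base = small, .iov_len = sizeof(small) };
 *		struct msghdr msg;
 *
 *		memset(&msg, 0, sizeof(msg));
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		if (recvmsg(udp_fd, &msg, 0) >= 0 &&
 *		    (msg.msg_flags & MSG_TRUNC))
 *			printf("datagram truncated to %zu bytes\n",
 *			    sizeof(small));
 *	}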
*/ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. 
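 *
 * For unix-domain sockets the externalization step is unp_externalize()
 * turning in-kernel file pointers back into descriptors. A minimal
 * SCM_RIGHTS sender sketch (the receiver is symmetrical; error
 * handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <string.h>
 *
 *	static int
 *	send_fd(int s, int fd)
 *	{
 *		char cbuf[CMSG_SPACE(sizeof(int))];
 *		char byte = 0;
 *		struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
 *		struct msghdr msg;
 *		struct cmsghdr *cm;
 *
 *		memset(&msg, 0, sizeof(msg));
 *		msg.msg_iov = &iov;
 *		msg.msg_iovlen = 1;
 *		msg.msg_control = cbuf;
 *		msg.msg_controllen = sizeof(cbuf);
 *		cm = CMSG_FIRSTHDR(&msg);
 *		cm->cmsg_level = SOL_SOCKET;
 *		cm->cmsg_type = SCM_RIGHTS;
 *		cm->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cm), &fd, sizeof(int));
 *		return (sendmsg(s, &msg, 0) == -1 ? -1 : 0);
 *	}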
*/ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) return (ENOTCONN); CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return (error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); return (0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. 
*/ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) { error = (*so->so_proto->pr_ctloutput)(so, sopt); CURVNET_RESTORE(); return (error); } error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = do_setopt_accept_filter(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for 
any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: if (sbreserve(sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv, (u_long)optval, so, curthread) == 0) { error = ENOBUFS; goto bad; } (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; break; /* * Make sure the low-water is never greater than the * high-water. */ case SO_SNDLOWAT: SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_lowat = (optval > so->so_snd.sb_hiwat) ? so->so_snd.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_snd); break; case SO_RCVLOWAT: SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_lowat = (optval > so->so_rcv.sb_hiwat) ? so->so_rcv.sb_hiwat : optval; SOCKBUF_UNLOCK(&so->so_rcv); break; } break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. 
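 *
 * A userland sketch of that truncation contract: with a one-byte
 * buffer, getsockopt() still succeeds, copies out one byte, and
 * reports the shortened length instead of failing:
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void
 *	truncated_get(int s)
 *	{
 *		char one;
 *		socklen_t len = sizeof(one);
 *
 *		if (getsockopt(s, SOL_SOCKET, SO_TYPE, &one, &len) == 0)
 *			printf("copied %u byte(s) of SO_TYPE\n",
 *			    (unsigned)len);
 *	}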
*/ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = do_getopt_accept_filter(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = so->so_qlimit; goto integer; case SO_LISTENQLEN: optval = so->so_qlen; goto integer; case SO_LISTENINCQLEN: optval = so->so_incqlen; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? 
M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* the chain should have been allocated large enough at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* the user should have supplied a buffer large enough for the whole option */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rcv.sb_sel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). 
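 *
 * A userland view of the event mapping implemented by sopoll_generic()
 * below: POLLIN for readable data or a closed receive side, POLLPRI
 * while at or before the out-of-band mark, POLLHUP once both
 * directions are shut down. Sketch:
 *
 *	#include <poll.h>
 *	#include <stdio.h>
 *
 *	static void
 *	watch(int s)
 *	{
 *		struct pollfd pfd =
 *		    { .fd = s, .events = POLLIN | POLLPRI | POLLOUT };
 *
 *		if (poll(&pfd, 1, 1000) > 0) {
 *			if (pfd.revents & POLLIN)
 *				printf("readable\n");
 *			if (pfd.revents & POLLPRI)
 *				printf("at or before OOB mark\n");
 *			if (pfd.revents & POLLHUP)
 *				printf("both directions closed\n");
 *		}
 *	}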
*/ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rcv.sb_sel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_snd.sb_sel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) kn->kn_fop = &solisten_filtops; else kn->kn_fop = &soread_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; sb = &so->so_snd; break; default: return (EINVAL); } SOCKBUF_LOCK(sb); knlist_add(&sb->sb_sel.si_note, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. 
*/ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_rcv); knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_rcv); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return 1; } else { if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return 1; } /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; SOCKBUF_LOCK(&so->so_snd); knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_snd); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } /*ARGSUSED*/ static int filt_solisten(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; kn->kn_data = so->so_qlen; return (!TAILQ_EMPTY(&so->so_comp)); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. 
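 *
 * In outline, for the active side:
 *
 *	connect()          -> soisconnecting()    (SS_ISCONNECTING set)
 *	connection made    -> soisconnected()     (SS_ISCONNECTED set,
 *	                                           sleepers/selectors woken)
 *	disconnect()       -> soisdisconnecting() (no further send/receive)
 *	connection severed -> soisdisconnected()  (send buffer dropped,
 *	                                           final wakeups delivered)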
The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { struct socket *head; int ret; restart: ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; head = so->so_head; if (head != NULL && (so->so_qstate & SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { SOCK_UNLOCK(so); TAILQ_REMOVE(&head->so_incomp, so, so_list); head->so_incqlen--; so->so_qstate &= ~SQ_INCOMP; TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); head->so_qlen++; so->so_qstate |= SQ_COMP; ACCEPT_UNLOCK(); sorwakeup(head); wakeup_one(&head->so_timeo); } else { ACCEPT_UNLOCK(); soupcall_set(so, SO_RCV, head->so_accf->so_accept_filter->accf_callback, head->so_accf->so_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->so_accf->so_accept_filter->accf_callback(so, head->so_accf->so_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); if (ret == SU_ISCONNECTED) goto restart; } return; } SOCK_UNLOCK(so); ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { /* * Note: This code assumes that SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) are the same. */ SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { /* * Note: This code assumes that SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) are the same. */ SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket buffer upcalls. 
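 *
 * A usage sketch (the callback and argument names are hypothetical): a
 * consumer that wants notification when data arrives would call
 *
 *	soupcall_set(so, SO_RCV, my_rcv_upcall, my_arg);
 *
 * and later remove it with soupcall_clear(so, SO_RCV).  Note that the
 * accept-filter code above invokes such callbacks with M_NOWAIT, so an
 * upcall should not assume it may sleep.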
*/ void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg) { struct sockbuf *sb; switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); #if 0 /* XXX: accf_http actually wants to do this on purpose. */ KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); #endif sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { xso->xso_len = sizeof *xso; xso->xso_so = so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_qlen = so->so_qlen; xso->so_incqlen = so->so_incqlen; xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; } /* * Socket accessor functions to provide external consumers with * a safe interface to socket state * */ void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg) { TAILQ_FOREACH(so, &so->so_comp, so_list) func(so, arg); } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } Index: head/sys/net/if_arcsubr.c =================================================================== --- head/sys/net/if_arcsubr.c (revision 305823) +++ head/sys/net/if_arcsubr.c (revision 305824) @@ -1,834 +1,834 @@ /* $NetBSD: if_arcsubr.c,v 1.36 2001/06/14 05:44:23 itojun Exp $ */ /* $FreeBSD$ */ /*- * Copyright (c) 1994, 1995 Ignatios Souvatzis * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: NetBSD: if_ethersubr.c,v 1.9 1994/06/29 06:36:11 cgd Exp * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #define ARCNET_ALLOW_BROKEN_ARP static struct mbuf *arc_defrag(struct ifnet *, struct mbuf *); static int arc_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); u_int8_t arcbroadcastaddr = 0; #define ARC_LLADDR(ifp) (*(u_int8_t *)IF_LLADDR(ifp)) #define senderr(e) { error = (e); goto bad;} #define SIN(s) ((const struct sockaddr_in *)(s)) /* * ARCnet output routine. * Encapsulate a packet of type family for the local net. * Assumes that ifp is actually pointer to arccom structure. */ int arc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct arc_header *ah; int error; u_int8_t atype, adst; int loop_copy = 0; int isphds; #if defined(INET) || defined(INET6) int is_gw = 0; #endif if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return(ENETDOWN); /* m, m1 aren't initialized yet */ error = 0; #if defined(INET) || defined(INET6) if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #endif switch (dst->sa_family) { #ifdef INET case AF_INET: /* * For now, use the simple IP addr -> ARCnet addr mapping */ if (m->m_flags & (M_BCAST|M_MCAST)) adst = arcbroadcastaddr; /* ARCnet broadcast address */ else if (ifp->if_flags & IFF_NOARP) adst = ntohl(SIN(dst)->sin_addr.s_addr) & 0xFF; else { error = arpresolve(ifp, is_gw, m, dst, &adst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } atype = (ifp->if_flags & IFF_LINK0) ? ARCTYPE_IP_OLD : ARCTYPE_IP; break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ARCNET); loop_copy = -1; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: atype = ARCTYPE_REVARP; break; case ARPOP_REQUEST: case ARPOP_REPLY: default: atype = ARCTYPE_ARP; break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, &adst, ARC_ADDR_LEN); else bcopy(ar_tha(ah), &adst, ARC_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) != 0) adst = arcbroadcastaddr; /* ARCnet broadcast address */ else { error = nd6_resolve(ifp, is_gw, m, dst, &adst, NULL, NULL); if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } atype = ARCTYPE_INET6; break; #endif case AF_UNSPEC: { const struct arc_header *ah; loop_copy = -1; ah = (const struct arc_header *)dst->sa_data; adst = ah->arc_dhost; atype = ah->arc_type; if (atype == ARCTYPE_ARP) { atype = (ifp->if_flags & IFF_LINK0) ? 
ARCTYPE_ARP_OLD: ARCTYPE_ARP; #ifdef ARCNET_ALLOW_BROKEN_ARP /* * XXX It's not clear per RFC826 if this is needed, but * "assigned numbers" say this is wrong. * However, e.g., AmiTCP 3.0Beta used it... we make this * switchable for emergency cases. Not perfect, but... */ if (ifp->if_flags & IFF_LINK2) mtod(m, struct arphdr *)->ar_pro = atype - 1; #endif } break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } isphds = arc_isphds(atype); M_PREPEND(m, isphds ? ARC_HDRNEWLEN : ARC_HDRLEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); ah = mtod(m, struct arc_header *); ah->arc_type = atype; ah->arc_dhost = adst; ah->arc_shost = ARC_LLADDR(ifp); if (isphds) { ah->arc_flag = 0; ah->arc_seqid = 0; } if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { - struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); (void) if_simloop(ifp, n, dst->sa_family, ARC_HDRLEN); } else if (ah->arc_dhost == ah->arc_shost) { (void) if_simloop(ifp, m, dst->sa_family, ARC_HDRLEN); return (0); /* XXX */ } } BPF_MTAP(ifp, m); error = ifp->if_transmit(ifp, m); return (error); bad: if (m) m_freem(m); return (error); } void arc_frag_init(struct ifnet *ifp) { struct arccom *ac; ac = (struct arccom *)ifp->if_l2com; ac->curr_frag = 0; } struct mbuf * arc_frag_next(struct ifnet *ifp) { struct arccom *ac; struct mbuf *m; struct arc_header *ah; ac = (struct arccom *)ifp->if_l2com; if ((m = ac->curr_frag) == NULL) { int tfrags; /* dequeue new packet */ IF_DEQUEUE(&ifp->if_snd, m); if (m == NULL) return 0; ah = mtod(m, struct arc_header *); if (!arc_isphds(ah->arc_type)) return m; ++ac->ac_seqid; /* make the seqid unique */ tfrags = howmany(m->m_pkthdr.len, ARC_MAX_DATA); ac->fsflag = 2 * tfrags - 3; ac->sflag = 0; ac->rsflag = ac->fsflag; ac->arc_dhost = ah->arc_dhost; ac->arc_shost = ah->arc_shost; ac->arc_type = ah->arc_type; m_adj(m, ARC_HDRNEWLEN); ac->curr_frag = m; } /* split out next fragment and return it */ if (ac->sflag < ac->fsflag) { /* we CAN'T have short packets here */ ac->curr_frag = m_split(m, ARC_MAX_DATA, M_NOWAIT); if (ac->curr_frag == 0) { m_freem(m); return 0; } M_PREPEND(m, ARC_HDRNEWLEN, M_NOWAIT); if (m == NULL) { m_freem(ac->curr_frag); ac->curr_frag = 0; return 0; } ah = mtod(m, struct arc_header *); ah->arc_flag = ac->rsflag; ah->arc_seqid = ac->ac_seqid; ac->sflag += 2; ac->rsflag = ac->sflag; } else if ((m->m_pkthdr.len >= ARC_MIN_FORBID_LEN - ARC_HDRNEWLEN + 2) && (m->m_pkthdr.len <= ARC_MAX_FORBID_LEN - ARC_HDRNEWLEN + 2)) { ac->curr_frag = 0; M_PREPEND(m, ARC_HDRNEWLEN_EXC, M_NOWAIT); if (m == NULL) return 0; ah = mtod(m, struct arc_header *); ah->arc_flag = 0xFF; ah->arc_seqid = 0xFFFF; ah->arc_type2 = ac->arc_type; ah->arc_flag2 = ac->sflag; ah->arc_seqid2 = ac->ac_seqid; } else { ac->curr_frag = 0; M_PREPEND(m, ARC_HDRNEWLEN, M_NOWAIT); if (m == NULL) return 0; ah = mtod(m, struct arc_header *); ah->arc_flag = ac->sflag; ah->arc_seqid = ac->ac_seqid; } ah->arc_dhost = ac->arc_dhost; ah->arc_shost = ac->arc_shost; ah->arc_type = ac->arc_type; return m; } /* * Defragmenter. Returns mbuf if last packet found, else * NULL. frees incoming mbuf as necessary. 
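 *
 * Worked example of the split-flag arithmetic used by arc_frag_next()
 * above and checked by this defragmenter: a packet split into
 * tfrags == 3 pieces goes out with arc_flag values 3 (the first
 * fragment, fsflag = 2*3 - 3, always odd), then 2, then 4 (the last
 * fragment carries sflag, which by then exceeds fsflag).  On receive,
 * maxflag is latched from the first fragment, each follow-on must
 * arrive with flag == lastseen + 2, and the packet is complete once
 * lastseen > maxflag.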
*/ static __inline struct mbuf * arc_defrag(struct ifnet *ifp, struct mbuf *m) { struct arc_header *ah, *ah1; struct arccom *ac; struct ac_frag *af; struct mbuf *m1; char *s; int newflen; u_char src,dst,typ; ac = (struct arccom *)ifp->if_l2com; if (m->m_len < ARC_HDRNEWLEN) { m = m_pullup(m, ARC_HDRNEWLEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return NULL; } } ah = mtod(m, struct arc_header *); typ = ah->arc_type; if (!arc_isphds(typ)) return m; src = ah->arc_shost; dst = ah->arc_dhost; if (ah->arc_flag == 0xff) { m_adj(m, 4); if (m->m_len < ARC_HDRNEWLEN) { m = m_pullup(m, ARC_HDRNEWLEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return NULL; } } ah = mtod(m, struct arc_header *); } af = &ac->ac_fragtab[src]; m1 = af->af_packet; s = "debug code error"; if (ah->arc_flag & 1) { /* * first fragment. We always initialize, which is * about the right thing to do, as we only want to * accept one fragmented packet per src at a time. */ if (m1 != NULL) m_freem(m1); af->af_packet = m; m1 = m; af->af_maxflag = ah->arc_flag; af->af_lastseen = 0; af->af_seqid = ah->arc_seqid; return NULL; /* notreached */ } else { /* check for unfragmented packet */ if (ah->arc_flag == 0) return m; /* do we have a first packet from that src? */ if (m1 == NULL) { s = "no first frag"; goto outofseq; } ah1 = mtod(m1, struct arc_header *); if (ah->arc_seqid != ah1->arc_seqid) { s = "seqid differs"; goto outofseq; } if (typ != ah1->arc_type) { s = "type differs"; goto outofseq; } if (dst != ah1->arc_dhost) { s = "dest host differs"; goto outofseq; } /* typ, seqid and dst are ok here. */ if (ah->arc_flag == af->af_lastseen) { m_freem(m); return NULL; } if (ah->arc_flag == af->af_lastseen + 2) { /* ok, this is next fragment */ af->af_lastseen = ah->arc_flag; m_adj(m,ARC_HDRNEWLEN); /* * m_cat might free the first mbuf (with pkthdr) * in 2nd chain; therefore: */ newflen = m->m_pkthdr.len; m_cat(m1,m); m1->m_pkthdr.len += newflen; /* is it the last one? */ if (af->af_lastseen > af->af_maxflag) { af->af_packet = NULL; return(m1); } else return NULL; } s = "other reason"; /* if all else fails, it is out of sequence, too */ } outofseq: if (m1) { m_freem(m1); af->af_packet = NULL; } if (m) m_freem(m); log(LOG_INFO,"%s: got out of seq. packet: %s\n", ifp->if_xname, s); return NULL; } /* * return 1 if Packet Header Definition Standard, else 0. * For now: old IP, old ARP aren't obviously. Lacking correct information, * we guess that besides new IP and new ARP also IPX and APPLETALK are PHDS. * (Apple and Novell corporations were involved, among others, in PHDS work). * Easiest is to assume that everybody else uses that, too. */ int arc_isphds(u_int8_t type) { return (type != ARCTYPE_IP_OLD && type != ARCTYPE_ARP_OLD && type != ARCTYPE_DIAGNOSE); } /* * Process a received Arcnet packet; * the packet is in the mbuf chain m with * the ARCnet header. */ void arc_input(struct ifnet *ifp, struct mbuf *m) { struct arc_header *ah; int isr; u_int8_t atype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } /* possibly defragment: */ m = arc_defrag(ifp, m); if (m == NULL) return; BPF_MTAP(ifp, m); ah = mtod(m, struct arc_header *); /* does this belong to us? 
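 * A frame is kept when the interface is promiscuous, or when the
 * destination is the ARCnet broadcast address (0) or our own one-byte
 * link-level address; anything else is dropped below.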
*/ if ((ifp->if_flags & IFF_PROMISC) == 0 && ah->arc_dhost != arcbroadcastaddr && ah->arc_dhost != ARC_LLADDR(ifp)) { m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if (ah->arc_dhost == arcbroadcastaddr) { m->m_flags |= M_BCAST|M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } atype = ah->arc_type; switch (atype) { #ifdef INET case ARCTYPE_IP: m_adj(m, ARC_HDRNEWLEN); isr = NETISR_IP; break; case ARCTYPE_IP_OLD: m_adj(m, ARC_HDRLEN); isr = NETISR_IP; break; case ARCTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } m_adj(m, ARC_HDRNEWLEN); isr = NETISR_ARP; #ifdef ARCNET_ALLOW_BROKEN_ARP mtod(m, struct arphdr *)->ar_pro = htons(ETHERTYPE_IP); #endif break; case ARCTYPE_ARP_OLD: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } m_adj(m, ARC_HDRLEN); isr = NETISR_ARP; #ifdef ARCNET_ALLOW_BROKEN_ARP mtod(m, struct arphdr *)->ar_pro = htons(ETHERTYPE_IP); #endif break; #endif #ifdef INET6 case ARCTYPE_INET6: m_adj(m, ARC_HDRNEWLEN); isr = NETISR_IPV6; break; #endif default: m_freem(m); return; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); } /* * Register (new) link level address. */ void arc_storelladdr(struct ifnet *ifp, u_int8_t lla) { ARC_LLADDR(ifp) = lla; } /* * Perform common duties while attaching to interface list */ void arc_ifattach(struct ifnet *ifp, u_int8_t lla) { struct ifaddr *ifa; struct sockaddr_dl *sdl; struct arccom *ac; if_attach(ifp); ifp->if_addrlen = 1; ifp->if_hdrlen = ARC_HDRLEN; ifp->if_mtu = 1500; ifp->if_resolvemulti = arc_resolvemulti; if (ifp->if_baudrate == 0) ifp->if_baudrate = 2500000; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ARCNET; sdl->sdl_alen = ifp->if_addrlen; if (ifp->if_flags & IFF_BROADCAST) ifp->if_flags |= IFF_MULTICAST|IFF_ALLMULTI; ac = (struct arccom *)ifp->if_l2com; ac->ac_seqid = (time_second) & 0xFFFF; /* try to make seqid unique */ if (lla == 0) { /* XXX this message isn't entirely clear, to me -- cgd */ log(LOG_ERR,"%s: link address 0 reserved for broadcasts. Please change it and ifconfig %s down up\n", ifp->if_xname, ifp->if_xname); } arc_storelladdr(ifp, lla); ifp->if_broadcastaddr = &arcbroadcastaddr; bpfattach(ifp, DLT_ARCNET, ARC_HDRLEN); } void arc_ifdetach(struct ifnet *ifp) { bpfdetach(ifp); if_detach(ifp); } int arc_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) &ifr->ifr_data; *(u_int8_t *)sa->sa_data = ARC_LLADDR(ifp); } break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) error = EAFNOSUPPORT; else { switch (ifr->ifr_addr.sa_family) { case AF_INET: case AF_INET6: error = 0; break; default: error = EAFNOSUPPORT; break; } } break; case SIOCSIFMTU: /* * Set the interface MTU. 
* mtu can't be larger than ARCMTU for RFC1051 * and can't be larger than ARC_PHDS_MTU */ if (((ifp->if_flags & IFF_LINK0) && ifr->ifr_mtu > ARCMTU) || ifr->ifr_mtu > ARC_PHDS_MAXMTU) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; } return (error); } /* based on ether_resolvemulti() */ int arc_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; if (*LLADDR(sdl) != arcbroadcastaddr) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ARC_ADDR_LEN; *LLADDR(sdl) = 0; *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ARC_ADDR_LEN; *LLADDR(sdl) = 0; *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static MALLOC_DEFINE(M_ARCCOM, "arccom", "ARCNET interface internals"); static void* arc_alloc(u_char type, struct ifnet *ifp) { struct arccom *ac; ac = malloc(sizeof(struct arccom), M_ARCCOM, M_WAITOK | M_ZERO); ac->ac_ifp = ifp; return (ac); } static void arc_free(void *com, u_char type) { free(com, M_ARCCOM); } static int arc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if_register_com_alloc(IFT_ARCNET, arc_alloc, arc_free); break; case MOD_UNLOAD: if_deregister_com_alloc(IFT_ARCNET); break; default: return EOPNOTSUPP; } return (0); } static moduledata_t arc_mod = { "arcnet", arc_modevent, 0 }; DECLARE_MODULE(arcnet, arc_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(arcnet, 1); Index: head/sys/net/if_fddisubr.c =================================================================== --- head/sys/net/if_fddisubr.c (revision 305823) +++ head/sys/net/if_fddisubr.c (revision 305824) @@ -1,670 +1,670 @@ /*- * Copyright (c) 1995, 1996 * Matt Thomas . All rights reserved. * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #ifdef DECNET #include #endif #include static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int fddi_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); static int fddi_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void fddi_input(struct ifnet *ifp, struct mbuf *m); #define senderr(e) do { error = (e); goto bad; } while (0) /* * FDDI output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ static int fddi_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t type; int loop_copy = 0, error = 0, hdrcmplt = 0; u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN]; struct fddi_header *fh; #if defined(INET) || defined(INET6) int is_gw = 0; #endif #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); #if defined(INET) || defined(INET6) if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #endif switch (dst->sa_family) { #ifdef INET case AF_INET: { error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; } case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = -1; /* if this is for us, don't do it */ switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, FDDI_ADDR_LEN); else bcopy(ar_tha(ah), edst, FDDI_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IPV6); break; #endif /* INET6 */ case pseudo_AF_HDRCMPLT: { const struct ether_header *eh; hdrcmplt = 1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_shost, esrc, FDDI_ADDR_LEN); /* FALLTHROUGH */ } case AF_UNSPEC: { const struct ether_header *eh; loop_copy = -1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_dhost, edst, FDDI_ADDR_LEN); if (*edst & 1) m->m_flags |= (M_BCAST|M_MCAST); type = eh->ether_type; break; } case AF_IMPLINK: { fh = mtod(m, struct fddi_header *); error = EPROTONOSUPPORT; switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { case FDDIFC_LLC_ASYNC: { /* legal priorities are 0 through 7 */ if ((fh->fddi_fc & FDDIFC_Z) > 7) goto bad; break; } case FDDIFC_LLC_SYNC: { /* FDDIFC_Z bits reserved, must be zero */ if (fh->fddi_fc & FDDIFC_Z) goto bad; break; } case FDDIFC_SMT: { /* FDDIFC_Z bits must be non zero */ if ((fh->fddi_fc & FDDIFC_Z) == 0) goto bad; break; } default: { /* anything else is too dangerous */ goto bad; } } error = 0; if (fh->fddi_dhost[0] & 1) m->m_flags |= (M_BCAST|M_MCAST); goto queue_it; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } /* * Add LLC header. */ if (type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, FDDI_HDR_LEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); fh = mtod(m, struct fddi_header *); fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; bcopy((caddr_t)edst, (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN); queue_it: if (hdrcmplt) bcopy((caddr_t)esrc, (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); else bcopy(IF_LLADDR(ifp), (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; - n = m_copy(m, 0, (int)M_COPYALL); + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); (void) if_simloop(ifp, n, dst->sa_family, FDDI_HDR_LEN); } else if (bcmp(fh->fddi_dhost, fh->fddi_shost, FDDI_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, FDDI_HDR_LEN); return (0); /* XXX */ } } error = (ifp->if_transmit)(ifp, m); if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * Process a received FDDI packet. */ static void fddi_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { int isr; struct llc *l; struct fddi_header *fh; /* * Do consistency checks to verify assumptions * made by code past this point. 
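 *
 * Concretely: the mbuf must carry a packet header (M_PKTHDR) naming
 * the receiving interface, and the chain is pulled up so that the full
 * FDDI header is contiguous before it is dereferenced.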
*/ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, FDDI_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } fh = mtod(m, struct fddi_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((fh->fddi_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. */ if (fh->fddi_dhost[0] & 1) { if (bcmp(ifp->if_broadcastaddr, fh->fddi_dhost, FDDI_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef M_LINK0 /* * If this has a LLC priority of 0, then mark it so upper * layers have a hint that it really came via a FDDI/Ethernet * bridge. */ if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) m->m_flags |= M_LINK0; #endif /* Strip off FDDI header. */ m_adj(m, FDDI_HDR_LEN); m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif #ifdef DECNET case ETHERTYPE_DECNET: isr = NETISR_DECNET; break; #endif default: /* printf("fddi_input: unknown protocol 0x%x\n", type); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } default: /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } /* * Perform common duties while attaching to interface list */ void fddi_ifattach(ifp, lla, bpf) struct ifnet *ifp; const u_int8_t *lla; int bpf; { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_type = IFT_FDDI; ifp->if_addrlen = FDDI_ADDR_LEN; ifp->if_hdrlen = 21; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_mtu = FDDIMTU; ifp->if_output = fddi_output; ifp->if_input = fddi_input; ifp->if_resolvemulti = fddi_resolvemulti; ifp->if_broadcastaddr = fddibroadcastaddr; ifp->if_baudrate = 100000000; #ifdef IFF_NOTRAILERS 
ifp->if_flags |= IFF_NOTRAILERS; #endif ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_FDDI; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_FDDI, FDDI_HDR_LEN); return; } void fddi_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int fddi_ioctl (ifp, command, data) struct ifnet *ifp; u_long command; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: /* before arpwhohas */ ifp->if_init(ifp->if_softc); arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, FDDI_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > FDDIMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; break; } return (error); } static int fddi_resolvemulti(ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) return (EADDRNOTAVAIL); *llsa = NULL; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } static moduledata_t fddi_mod = { "fddi", /* module name */ NULL, /* event handler */ 0 /* extra data */ }; DECLARE_MODULE(fddi, fddi_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(fddi, 1); Index: head/sys/net/if_iso88025subr.c =================================================================== --- head/sys/net/if_iso88025subr.c (revision 305823) +++ head/sys/net/if_iso88025subr.c (revision 305824) @@ -1,697 +1,697 @@ /*- * Copyright (c) 1998, Larry Lile * All rights reserved. * * For latest sources and information on this driver, please * go to http://anarchy.stdio.com. * * Questions, comments or suggestions should be directed to * Larry Lile . 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ /* * * General ISO 802.5 (Token Ring) support routines * */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #include static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int iso88025_resolvemulti (struct ifnet *, struct sockaddr **, struct sockaddr *); #define senderr(e) do { error = (e); goto bad; } while (0) /* * Perform common duties while attaching to interface list */ void iso88025_ifattach(struct ifnet *ifp, const u_int8_t *lla, int bpf) { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifa = NULL; ifp->if_type = IFT_ISO88025; ifp->if_addrlen = ISO88025_ADDR_LEN; ifp->if_hdrlen = ISO88025_HDR_LEN; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_output = iso88025_output; ifp->if_input = iso88025_input; ifp->if_resolvemulti = iso88025_resolvemulti; ifp->if_broadcastaddr = iso88025_broadcastaddr; if (ifp->if_baudrate == 0) ifp->if_baudrate = TR_16MBPS; /* 16Mbit should be a safe default */ if (ifp->if_mtu == 0) ifp->if_mtu = ISO88025_DEFAULT_MTU; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ISO88025; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_IEEE802, ISO88025_HDR_LEN); return; } /* * Perform common duties while detaching a Token Ring interface */ void iso88025_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif /* INET */ default: ifp->if_init(ifp->if_softc); break; } break; case 
SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ISO88025_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ISO88025_MAX_MTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } /* * ISO88025 encapsulation */ int iso88025_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t snap_type = 0; int loop_copy = 0, error = 0, rif_len = 0; u_char edst[ISO88025_ADDR_LEN]; struct iso88025_header *th; struct iso88025_header gen_th; struct sockaddr_dl *sdl = NULL; struct rtentry *rt0 = NULL; int is_gw = 0; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); /* Calculate routing info length based on arp table entry */ /* XXX any better way to do this ? */ if (rt0 && (sdl = (struct sockaddr_dl *)rt0->rt_gateway)) if (SDL_ISO88025(sdl)->trld_rcf != 0) rif_len = TR_RCF_RIFLEN(SDL_ISO88025(sdl)->trld_rcf); /* Generate a generic 802.5 header for the packet */ gen_th.ac = TR_AC; gen_th.fc = TR_LLC_FRAME; (void)memcpy((caddr_t)gen_th.iso88025_shost, IF_LLADDR(ifp), ISO88025_ADDR_LEN); if (rif_len) { gen_th.iso88025_shost[0] |= TR_RII; if (rif_len > 2) { gen_th.rcf = SDL_ISO88025(sdl)->trld_rcf; (void)memcpy((caddr_t)gen_th.rd, (caddr_t)SDL_ISO88025(sdl)->trld_route, rif_len - 2); } } switch (dst->sa_family) { #ifdef INET case AF_INET: error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IP; break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_IEEE802); loop_copy = -1; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: snap_type = ETHERTYPE_REVARP; break; case ARPOP_REQUEST: case ARPOP_REPLY: default: snap_type = ETHERTYPE_ARP; break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ISO88025_ADDR_LEN); else bcopy(ar_tha(ah), edst, ISO88025_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IPV6; break; #endif /* INET6 */ case AF_UNSPEC: { const struct iso88025_sockaddr_data *sd; /* * For AF_UNSPEC sockaddr.sa_data must contain all of the * mac information needed to send the packet. This allows * full mac, llc, and source routing function to be controlled. * llc and source routing information must already be in the * mbuf provided, ac/fc are set in sa_data. sockaddr.sa_data * should be an iso88025_sockaddr_data structure see iso88025.h */ loop_copy = -1; sd = (const struct iso88025_sockaddr_data *)dst->sa_data; gen_th.ac = sd->ac; gen_th.fc = sd->fc; (void)memcpy(edst, sd->ether_dhost, ISO88025_ADDR_LEN); (void)memcpy(gen_th.iso88025_shost, sd->ether_shost, ISO88025_ADDR_LEN); rif_len = 0; break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); break; } /* * Add LLC header. 
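 *
 * The encapsulation built below is the standard 8-byte
 * (LLC_SNAPFRAMELEN) LLC/SNAP header: DSAP = SSAP = 0xAA
 * (LLC_SNAP_LSAP), control = 0x03 (LLC_UI), a zero OUI, then the
 * 16-bit protocol type in network byte order.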
*/ if (snap_type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(snap_type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); th = mtod(m, struct iso88025_header *); bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN); /* Copy as much of the generic header as is needed into the mbuf */ memcpy(th, &gen_th, ISO88025_HDR_LEN + rif_len); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; - n = m_copy(m, 0, (int)M_COPYALL); + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); (void) if_simloop(ifp, n, dst->sa_family, ISO88025_HDR_LEN); } else if (bcmp(th->iso88025_dhost, th->iso88025_shost, ETHER_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, ISO88025_HDR_LEN); return(0); /* XXX */ } } IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error); if (error) { printf("iso88025_output: packet dropped QFULL.\n"); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * ISO 88025 de-encapsulation */ void iso88025_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { struct iso88025_header *th; struct llc *l; int isr; int mac_hdr_len; /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, ISO88025_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } th = mtod(m, struct iso88025_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((th->iso88025_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t) th->iso88025_dhost, ISO88025_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. 
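 *
 * A destination address whose first byte has the group bit set is
 * multicast; if it additionally matches iso88025_broadcastaddr exactly,
 * the frame is marked M_BCAST rather than M_MCAST.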
*/ if (th->iso88025_dhost[0] & 1) { if (bcmp(iso88025_broadcastaddr, th->iso88025_dhost, ISO88025_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } mac_hdr_len = ISO88025_HDR_LEN; /* Check for source routing info */ if (th->iso88025_shost[0] & TR_RII) mac_hdr_len += TR_RCF_RIFLEN(th->rcf); /* Strip off ISO88025 header. */ m_adj(m, mac_hdr_len); m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: th->iso88025_shost[0] &= ~(TR_RII); isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif /* INET */ #ifdef INET6 case ETHERTYPE_IPV6: th->iso88025_shost[0] &= ~(TR_RII); isr = NETISR_IPV6; break; #endif /* INET6 */ default: printf("iso88025_input: unexpected llc_snap ether_type 0x%02x\n", type); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } #ifdef ISO case LLC_ISO_LSAP: switch (l->llc_control) { case LLC_UI: if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; case LLC_XID: case LLC_XID_P: if(m->m_len < ISO88025_ADDR_LEN) goto dropanyway; l->llc_window = 0; l->llc_fid = 9; l->llc_class = 1; l->llc_dsap = l->llc_ssap = 0; /* Fall through to */ case LLC_TEST: case LLC_TEST_P: { struct sockaddr sa; struct iso88025_sockaddr_data *th2; int i; u_char c; c = l->llc_dsap; if (th->iso88025_shost[0] & TR_RII) { /* XXX */ printf("iso88025_input: dropping source routed LLC_TEST\n"); goto dropanyway; } l->llc_dsap = l->llc_ssap; l->llc_ssap = c; if (m->m_flags & (M_BCAST | M_MCAST)) bcopy((caddr_t)IF_LLADDR(ifp), (caddr_t)th->iso88025_dhost, ISO88025_ADDR_LEN); sa.sa_family = AF_UNSPEC; sa.sa_len = sizeof(sa); th2 = (struct iso88025_sockaddr_data *)sa.sa_data; for (i = 0; i < ISO88025_ADDR_LEN; i++) { th2->ether_shost[i] = c = th->iso88025_dhost[i]; th2->ether_dhost[i] = th->iso88025_dhost[i] = th->iso88025_shost[i]; th->iso88025_shost[i] = c; } th2->ac = TR_AC; th2->fc = TR_LLC_FRAME; ifp->if_output(ifp, m, &sa, NULL); return; } default: printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } break; #endif /* ISO */ default: printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } static int iso88025_resolvemulti (ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. 
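 *
 * For AF_LINK the address is already link-level, so *llsa is returned
 * NULL; in the AF_INET/AF_INET6 cases below a sockaddr_dl is
 * initialized (link_init_sdl()) and the group address is mapped into
 * it with ETHER_MAP_IP_MULTICAST / ETHER_MAP_IPV6_MULTICAST.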
*/ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) { return (EADDRNOTAVAIL); } *llsa = NULL; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } static moduledata_t iso88025_mod = { .name = "iso88025", }; DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(iso88025, 1); Index: head/sys/net/raw_usrreq.c =================================================================== --- head/sys/net/raw_usrreq.c (revision 305823) +++ head/sys/net/raw_usrreq.c (revision 305824) @@ -1,274 +1,274 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); /* * Initialize raw connection block q. */ void raw_init(void) { LIST_INIT(&V_rawcb_list); } /* * Raw protocol input routine. 
Find the socket associated with the packet(s)
 * and move them over.  If nothing exists for this packet, drop it.
 */
/*
 * Raw protocol interface.
 */
void
raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src)
{

	return (raw_input_ext(m0, proto, src, NULL));
}

void
raw_input_ext(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src,
    raw_input_cb_fn cb)
{
	struct rawcb *rp;
	struct mbuf *m = m0;
	struct socket *last;

	last = NULL;
	mtx_lock(&rawcb_mtx);
	LIST_FOREACH(rp, &V_rawcb_list, list) {
		if (rp->rcb_proto.sp_family != proto->sp_family)
			continue;
		if (rp->rcb_proto.sp_protocol &&
		    rp->rcb_proto.sp_protocol != proto->sp_protocol)
			continue;
		if (cb != NULL && (*cb)(m, proto, src, rp) != 0)
			continue;
		if (last) {
			struct mbuf *n;

-			n = m_copy(m, 0, (int)M_COPYALL);
+			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
			if (n) {
				if (sbappendaddr(&last->so_rcv, src,
				    n, (struct mbuf *)0) == 0)
					/* should notify about lost packet */
					m_freem(n);
				else
					sorwakeup(last);
			}
		}
		last = rp->rcb_socket;
	}
	if (last) {
		if (sbappendaddr(&last->so_rcv, src,
		    m, (struct mbuf *)0) == 0)
			m_freem(m);
		else
			sorwakeup(last);
	} else
		m_freem(m);
	mtx_unlock(&rawcb_mtx);
}

/*ARGSUSED*/
void
raw_ctlinput(int cmd, struct sockaddr *arg, void *dummy)
{

	if (cmd < 0 || cmd >= PRC_NCMDS)
		return;
	/* INCOMPLETE */
}

static void
raw_uabort(struct socket *so)
{

	KASSERT(sotorawcb(so) != NULL, ("raw_uabort: rp == NULL"));

	soisdisconnected(so);
}

static void
raw_uclose(struct socket *so)
{

	KASSERT(sotorawcb(so) != NULL, ("raw_uclose: rp == NULL"));

	soisdisconnected(so);
}

/* pru_accept is EOPNOTSUPP */

static int
raw_uattach(struct socket *so, int proto, struct thread *td)
{
	int error;

	/*
	 * Implementors of raw sockets will already have allocated the PCB,
	 * so it must be non-NULL here.
	 */
	KASSERT(sotorawcb(so) != NULL, ("raw_uattach: so_pcb == NULL"));

	if (td != NULL) {
		error = priv_check(td, PRIV_NET_RAW);
		if (error)
			return (error);
	}
	return (raw_attach(so, proto));
}

static int
raw_ubind(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EINVAL);
}

static int
raw_uconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EINVAL);
}

/* pru_connect2 is EOPNOTSUPP */
/* pru_control is EOPNOTSUPP */

static void
raw_udetach(struct socket *so)
{
	struct rawcb *rp = sotorawcb(so);

	KASSERT(rp != NULL, ("raw_udetach: rp == NULL"));

	raw_detach(rp);
}

static int
raw_udisconnect(struct socket *so)
{

	KASSERT(sotorawcb(so) != NULL, ("raw_udisconnect: rp == NULL"));

	return (ENOTCONN);
}

/* pru_listen is EOPNOTSUPP */

static int
raw_upeeraddr(struct socket *so, struct sockaddr **nam)
{

	KASSERT(sotorawcb(so) != NULL, ("raw_upeeraddr: rp == NULL"));

	return (ENOTCONN);
}

/* pru_rcvd is EOPNOTSUPP */
/* pru_rcvoob is EOPNOTSUPP */

static int
raw_usend(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{

	KASSERT(sotorawcb(so) != NULL, ("raw_usend: rp == NULL"));

	if ((flags & PRUS_OOB) || (control && control->m_len)) {
		/* XXXRW: Should control also be freed here? */
		if (m != NULL)
			m_freem(m);
		return (EOPNOTSUPP);
	}

	/*
	 * For historical (bad?) reasons, we effectively ignore the address
	 * argument to sendto(2).  Perhaps we should return an error instead?
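 *
 * (On the m_copym() conversion above: the delivery loop hands the
 * original chain to the last matching socket and a fresh copy to every
 * earlier match, so exactly one reference is consumed per receiver and
 * nothing leaks when no socket matches.  A minimal userland sketch of
 * that ownership pattern; the types and helpers are illustrative, and
 * m_copym(m, 0, M_COPYALL, M_NOWAIT) is the kernel call this models:)
 *
 *	#include <stdlib.h>
 *	#include <string.h>
 *
 *	struct buf { size_t len; char data[256]; };
 *
 *	static struct buf *
 *	buf_copy(const struct buf *b)	// may fail, like M_NOWAIT
 *	{
 *		struct buf *n = malloc(sizeof(*n));
 *		if (n != NULL)
 *			memcpy(n, b, sizeof(*n));
 *		return (n);
 *	}
 *
 *	// Give copies to consumers 0..n-2; the last one owns 'b'.
 *	static void
 *	fanout(struct buf *b, void (*deliver)(struct buf *), int n)
 *	{
 *		for (int i = 0; i < n - 1; i++) {
 *			struct buf *c = buf_copy(b);
 *			if (c != NULL)	// on failure, skip rather than block
 *				deliver(c);
 *		}
 *		if (n > 0)
 *			deliver(b);	// hand off the original
 *		else
 *			free(b);	// no takers: drop it
 *	}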
*/ return ((*so->so_proto->pr_output)(m, so)); } /* pru_sense is null */ static int raw_ushutdown(struct socket *so) { KASSERT(sotorawcb(so) != NULL, ("raw_ushutdown: rp == NULL")); socantsendmore(so); return (0); } static int raw_usockaddr(struct socket *so, struct sockaddr **nam) { KASSERT(sotorawcb(so) != NULL, ("raw_usockaddr: rp == NULL")); return (EINVAL); } struct pr_usrreqs raw_usrreqs = { .pru_abort = raw_uabort, .pru_attach = raw_uattach, .pru_bind = raw_ubind, .pru_connect = raw_uconnect, .pru_detach = raw_udetach, .pru_disconnect = raw_udisconnect, .pru_peeraddr = raw_upeeraddr, .pru_send = raw_usend, .pru_shutdown = raw_ushutdown, .pru_sockaddr = raw_usockaddr, .pru_close = raw_uclose, }; Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 305823) +++ head/sys/netinet/ip_input.c (revision 305824) @@ -1,1367 +1,1367 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. 
*/ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. 
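 *
 * (The counters behind this interface are per-CPU counter(9)s: an
 * increment touches only the local CPU's slot and a read folds all
 * slots together.  A minimal userland sketch of the idea; NCPU and the
 * helpers below are illustrative, not kernel names:)
 *
 *	#include <stdint.h>
 *
 *	#define NCPU	4
 *	#define NSTATS	16
 *	static uint64_t stats[NCPU][NSTATS];
 *
 *	static void
 *	stat_inc(int cpu, int statnum)	// cheap, no shared cacheline
 *	{
 *		stats[cpu][statnum]++;
 *	}
 *
 *	static uint64_t
 *	stat_read(int statnum)		// fold all CPUs on demand
 *	{
 *		uint64_t sum = 0;
 *		for (int c = 0; c < NCPU; c++)
 *			sum += stats[c][statnum];
 *		return (sum);
 *	}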
*/ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. 
*/ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. 
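 *
 * (On ip_init() above: every slot of ip_protox[] starts out pointing at
 * the raw-IP handler and only registered protocols are patched in, so
 * an unknown protocol number safely falls through to raw IP.  A minimal
 * userland sketch of that default-then-override dispatch table; the
 * handler names are illustrative:)
 *
 *	#include <stdio.h>
 *
 *	#define PROTO_MAX 256
 *	typedef void (*handler_t)(int proto);
 *
 *	static void raw_handler(int p) { printf("raw %d\n", p); }
 *	static void tcp_handler(int p) { printf("tcp\n"); }
 *	static handler_t protox[PROTO_MAX];
 *
 *	static void
 *	dispatch_init(void)
 *	{
 *		for (int i = 0; i < PROTO_MAX; i++)
 *			protox[i] = raw_handler;	// default
 *		protox[6] = tcp_handler;		// IPPROTO_TCP
 *	}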
 */
		ip = mtod(m, struct ip *);
		hlen = ip->ip_hl << 2;
		ip_len = ntohs(ip->ip_len);
		goto ours;
	}

	IPSTAT_INC(ips_total);

	if (m->m_pkthdr.len < sizeof(struct ip))
		goto tooshort;

	if (m->m_len < sizeof (struct ip) &&
	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
		IPSTAT_INC(ips_toosmall);
		return;
	}
	ip = mtod(m, struct ip *);

	if (ip->ip_v != IPVERSION) {
		IPSTAT_INC(ips_badvers);
		goto bad;
	}

	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(struct ip)) {	/* minimum header length */
		IPSTAT_INC(ips_badhlen);
		goto bad;
	}
	if (hlen > m->m_len) {
		if ((m = m_pullup(m, hlen)) == NULL) {
			IPSTAT_INC(ips_badhlen);
			return;
		}
		ip = mtod(m, struct ip *);
	}

	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);

	/* 127/8 must not appear on wire - RFC1122 */
	ifp = m->m_pkthdr.rcvif;
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			IPSTAT_INC(ips_badaddr);
			goto bad;
		}
	}

	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
	} else {
		if (hlen == sizeof(struct ip)) {
			sum = in_cksum_hdr(ip);
		} else {
			sum = in_cksum(m, hlen);
		}
	}
	if (sum) {
		IPSTAT_INC(ips_badsum);
		goto bad;
	}

#ifdef ALTQ
	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
		/* packet is dropped by traffic conditioner */
		return;
#endif

	ip_len = ntohs(ip->ip_len);
	if (ip_len < hlen) {
		IPSTAT_INC(ips_badlen);
		goto bad;
	}

	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < ip_len) {
tooshort:
		IPSTAT_INC(ips_tooshort);
		goto bad;
	}
	if (m->m_pkthdr.len > ip_len) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = ip_len;
			m->m_pkthdr.len = ip_len;
		} else
			m_adj(m, ip_len - m->m_pkthdr.len);
	}

	/* Try to forward the packet, but if we fail continue */
#ifdef IPSEC
	/* For now we do not handle IPSEC in tryforward. */
	if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) &&
	    (V_ipforwarding == 1))
		if (ip_tryforward(m) == NULL)
			return;
	/*
	 * Bypass packet filtering for packets previously handled by IPsec.
	 */
	if (ip_ipsec_filtertunnel(m))
		goto passin;
#else
	if (V_ipforwarding == 1)
		if (ip_tryforward(m) == NULL)
			return;
#endif /* IPSEC */

	/*
	 * Run through list of hooks for input packets.
	 *
	 * NB: Beware of the destination address changing (e.g.
	 *     by NAT rewriting).  When this happens, tell
	 *     ip_forward to do the right thing.
	 */

	/* Jump over all PFIL processing if hooks are not active. */
	if (!PFIL_HOOKED(&V_inet_pfil_hook))
		goto passin;

	odst = ip->ip_dst;
	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
		return;
	if (m == NULL)			/* consumed by filter */
		return;

	ip = mtod(m, struct ip *);
	dchg = (odst.s_addr != ip->ip_dst.s_addr);
	ifp = m->m_pkthdr.rcvif;

	if (m->m_flags & M_FASTFWD_OURS) {
		m->m_flags &= ~M_FASTFWD_OURS;
		goto ours;
	}
	if (m->m_flags & M_IP_NEXTHOP) {
		if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
			/*
			 * Directly ship the packet on.  This allows
			 * forwarding packets originally destined to us
			 * to some other directly connected host.
			 */
			ip_forward(m, 1);
			return;
		}
	}
passin:

	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
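 *
 * (Looking back at the checksum check above: the header checksum is the
 * standard 16-bit one's complement sum over the header, and a header
 * that checks out folds to zero.  A minimal self-contained verifier,
 * showing the same arithmetic as in_cksum_hdr() but not the kernel
 * routine itself:)
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	// Returns 0 when the IPv4 header (hlen bytes, hlen even,
 *	// 2-byte aligned) carries a valid checksum.
 *	static uint16_t
 *	ipv4_cksum(const void *hdr, size_t hlen)
 *	{
 *		const uint16_t *w = hdr;
 *		uint32_t sum = 0;
 *
 *		for (size_t i = 0; i < hlen / 2; i++)
 *			sum += w[i];
 *		sum = (sum >> 16) + (sum & 0xffff);	// fold carries
 *		sum += (sum >> 16);
 *		return ((uint16_t)~sum);
 *	}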
*/ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. 
*/ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m, ip->ip_p) != 0) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. 
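 *
 * (Registration and removal are symmetric edits of the same table: a
 * slot still holding the raw-IP index is free, which is the EEXIST
 * guard in ipproto_register() above, and unregistering just restores
 * that default, as done below.  A minimal userland sketch with
 * illustrative names:)
 *
 *	#include <errno.h>
 *
 *	#define PROTO_MAX 256
 *	static int protox[PROTO_MAX];	// handler indices
 *	static const int raw_idx = 0;	// default slot value
 *
 *	static int
 *	proto_register(int proto, int idx)
 *	{
 *		if (proto <= 0 || proto >= PROTO_MAX)
 *			return (EPROTONOSUPPORT);
 *		if (protox[proto] != raw_idx)
 *			return (EEXIST);	// already taken
 *		protox[proto] = idx;
 *		return (0);
 *	}
 *
 *	static int
 *	proto_unregister(int proto)
 *	{
 *		if (proto <= 0 || proto >= PROTO_MAX)
 *			return (EPROTONOSUPPORT);
 *		if (protox[proto] == raw_idx)
 *			return (ENOENT);	// was never registered
 *		protox[proto] = raw_idx;	// back to the default
 *		return (0);
 *	}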
 */
	ip_protox[ipproto] = pr - inetsw;
	return (0);
}

u_char inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		EHOSTUNREACH,	0,
	ENOPROTOOPT,	ECONNREFUSED
};

/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.  This could be confusing
 * if ipforwarding was zero but some routing protocol was advancing
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route.
 */
void
ip_forward(struct mbuf *m, int srcrt)
{
	struct ip *ip = mtod(m, struct ip *);
	struct in_ifaddr *ia;
	struct mbuf *mcopy;
	struct sockaddr_in *sin;
	struct in_addr dest;
	struct route ro;
	int error, type = 0, code = 0, mtu = 0;

	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
		IPSTAT_INC(ips_cantforward);
		m_freem(m);
		return;
	}
#ifdef IPSEC
	if (ip_ipsec_fwd(m) != 0) {
		IPSTAT_INC(ips_cantforward);
		m_freem(m);
		return;
	}
#endif /* IPSEC */
#ifdef IPSTEALTH
	if (!V_ipstealth) {
#endif
		if (ip->ip_ttl <= IPTTLDEC) {
			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
			    0, 0);
			return;
		}
#ifdef IPSTEALTH
	}
#endif

	bzero(&ro, sizeof(ro));
	sin = (struct sockaddr_in *)&ro.ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;
#ifdef RADIX_MPATH
	rtalloc_mpath_fib(&ro,
	    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
	    M_GETFIB(m));
#else
	in_rtalloc_ign(&ro, 0, M_GETFIB(m));
#endif
	if (ro.ro_rt != NULL) {
		ia = ifatoia(ro.ro_rt->rt_ifa);
		ifa_ref(&ia->ia_ifa);
	} else
		ia = NULL;
#ifndef IPSEC
	/*
	 * 'ia' may be NULL if there is no route for this destination.
	 * In case of IPsec, don't discard it just yet, but pass it to
	 * ip_output in case of outgoing IPsec policy.
	 */
	if (!srcrt && ia == NULL) {
		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
		RO_RTFREE(&ro);
		return;
	}
#endif

	/*
	 * Save the IP header and at most 8 bytes of the payload,
	 * in case we need to generate an ICMP message to the src.
	 *
	 * XXX this can be optimized a lot by saving the data in a local
	 * buffer on the stack (72 bytes at most), and only allocating the
	 * mbuf if really necessary.  The vast majority of the packets
	 * are forwarded without having to send an ICMP back (either
	 * because unnecessary, or because rate limited), so we are
	 * really wasting a lot of work here.
	 *
-	 * We don't use m_copy() because it might return a reference
+	 * We don't use m_copym() because it might return a reference
	 * to a shared cluster.  Both this function and ip_output()
	 * assume exclusive access to the IP header in `m', so any
	 * data in a cluster may change before we reach icmp_error().
	 */
	mcopy = m_gethdr(M_NOWAIT, m->m_type);
	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
		/*
		 * It's probably ok if the pkthdr dup fails (because
		 * the deep copy of the tag chain failed), but for now
		 * be conservative and just discard the copy since
		 * code below may some day want the tags.
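 *
 * (On the copy saved above: an ICMP error only needs to quote the
 * offending IP header plus the first 8 payload bytes, per RFC 792.
 * A minimal sketch of that length computation; illustrative, not the
 * mbuf code itself:)
 *
 *	#include <stddef.h>
 *
 *	// Bytes of the original packet to quote in an ICMP error.
 *	static size_t
 *	icmp_quote_len(size_t hlen, size_t ip_len)
 *	{
 *		size_t want = hlen + 8;	// header + 8 payload bytes
 *
 *		return (ip_len < want ? ip_len : want);
 *	}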
*/ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. 
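 *
 * (When no usable MTU is known, the fallback below steps down to the
 * next smaller "plateau" relative to the packet size.  A sketch of
 * that table walk; the plateau list here is illustrative, the real
 * values live in ip_next_mtu():)
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static const uint16_t mtu_plateaus[] =
 *	    { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280,
 *	      1006, 508, 296, 68 };
 *
 *	// Largest plateau strictly smaller than the failed size.
 *	static uint16_t
 *	next_smaller_mtu(uint16_t pktsize)
 *	{
 *		for (size_t i = 0;
 *		    i < sizeof(mtu_plateaus) / sizeof(mtu_plateaus[0]); i++)
 *			if (mtu_plateaus[i] < pktsize)
 *				return (mtu_plateaus[i]);
 *		return (68);	// IPv4 minimum
 *	}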
*/ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? 
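 *
 * (The 'mp = &(*mp)->m_next' idiom used throughout this function is
 * the classic tail-pointer append: keep a pointer to the link that
 * will hold the next record, so each append is O(1) and never
 * re-walks the chain.  A minimal sketch with a plain list and
 * illustrative types:)
 *
 *	#include <stdlib.h>
 *
 *	struct node { int v; struct node *next; };
 *
 *	static void
 *	append(struct node **tailp, int v)
 *	{
 *		struct node *n = calloc(1, sizeof(*n));
 *		if (n == NULL)
 *			return;		// skip on failure, like above
 *		n->v = v;
 *		*tailp = n;		// hook into the chain
 *	}
 *
 *	// Usage: struct node *head = NULL, **mp = &head;
 *	// append(mp, 1); if (*mp) mp = &(*mp)->next; append(mp, 2); ...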
*/ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: head/sys/netinet/ip_mroute.c =================================================================== --- head/sys/netinet/ip_mroute.c (revision 305823) +++ head/sys/netinet/ip_mroute.c (revision 305824) @@ -1,2949 +1,2949 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. 
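 *
 * (A minimal sketch of the nesting rule stated above, with pthread
 * mutexes standing in for the kernel mtx(9) locks:)
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t vif_lock = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_mutex_t mfc_lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *	static void
 *	update_both(void)
 *	{
 *		// VIF lock always first when both are needed ...
 *		pthread_mutex_lock(&vif_lock);
 *		pthread_mutex_lock(&mfc_lock);
 *		// ... touch the vif table and the forwarding cache ...
 *		pthread_mutex_unlock(&mfc_lock);
 *		pthread_mutex_unlock(&vif_lock);	// reverse order out
 *	}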
*/ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. 
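 *
 * (The MFC table above is bucketed the same way; MFCHASH folds the
 * (S,G) key by xoring each address with shifted copies of itself
 * before masking to the table size, which must be a power of two.
 * A userland rendering of the same arithmetic, where 'mask' plays
 * the role of V_mfchash:)
 *
 *	#include <stdint.h>
 *
 *	// 'o' and 'g' are origin/group IPv4 addresses as 32-bit values;
 *	// 'mask' is hashsize - 1, with hashsize a power of two.
 *	static uint32_t
 *	mfc_hash(uint32_t o, uint32_t g, uint32_t mask)
 *	{
 *		return (((o >> 20) ^ (o >> 10) ^ o ^
 *		    (g >> 20) ^ (g >> 10) ^ g) & mask);
 *	}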
*/ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; static const struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. 
*/ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
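 *
 * (The MRT_ADD_MFC/MRT_DEL_MFC cases below copy in either the old
 * struct mfcctl or the extended struct mfcctl2 depending on the
 * negotiated API, then zero the tail so later code can always read
 * the larger layout.  A minimal sketch of that pattern; the struct
 * names are illustrative and assume the v2 layout is a prefix-
 * compatible superset of v1:)
 *
 *	#include <string.h>
 *
 *	struct ctl_v1 { int a, b; };
 *	struct ctl_v2 { int a, b; int flags; };	// superset of v1
 *
 *	static void
 *	copyin_ctl(struct ctl_v2 *dst, const void *src, int extended)
 *	{
 *		if (extended) {
 *			memcpy(dst, src, sizeof(struct ctl_v2));
 *		} else {
 *			memcpy(dst, src, sizeof(struct ctl_v1));
 *			// Zero the v2-only fields for uniform reads.
 *			memset((char *)dst + sizeof(struct ctl_v1), 0,
 *			    sizeof(struct ctl_v2) - sizeof(struct ctl_v1));
 *		}
 *	}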
*/ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl_fib(). 
 * Typically, only root can create the raw socket in order to execute
 * this ioctl method, however the request might be coming from a prison.
 */
	error = priv_check(curthread, PRIV_NETINET_MROUTE);
	if (error)
		return (error);

	switch (cmd) {
	case (SIOCGETVIFCNT):
		error = get_vif_cnt((struct sioc_vif_req *)data);
		break;

	case (SIOCGETSGCNT):
		error = get_sg_cnt((struct sioc_sg_req *)data);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}

/*
 * returns the packet, byte, rpf-failure count for the source group provided
 */
static int
get_sg_cnt(struct sioc_sg_req *req)
{
	struct mfc *rt;

	MFC_LOCK();
	rt = mfc_find(&req->src, &req->grp);
	if (rt == NULL) {
		MFC_UNLOCK();
		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
		return EADDRNOTAVAIL;
	}
	req->pktcnt = rt->mfc_pkt_cnt;
	req->bytecnt = rt->mfc_byte_cnt;
	req->wrong_if = rt->mfc_wrong_if;
	MFC_UNLOCK();

	return 0;
}

/*
 * returns the input and output packet and byte counts on the vif provided
 */
static int
get_vif_cnt(struct sioc_vif_req *req)
{
	vifi_t vifi = req->vifi;

	VIF_LOCK();
	if (vifi >= V_numvifs) {
		VIF_UNLOCK();
		return EINVAL;
	}

	req->icount = V_viftable[vifi].v_pkt_in;
	req->ocount = V_viftable[vifi].v_pkt_out;
	req->ibytes = V_viftable[vifi].v_bytes_in;
	req->obytes = V_viftable[vifi].v_bytes_out;
	VIF_UNLOCK();

	return 0;
}

static void
if_detached_event(void *arg __unused, struct ifnet *ifp)
{
	vifi_t vifi;
	u_long i;

	MROUTER_LOCK();

	if (V_ip_mrouter == NULL) {
		MROUTER_UNLOCK();
		return;
	}

	VIF_LOCK();
	MFC_LOCK();

	/*
	 * Tear down multicast forwarder state associated with this ifnet.
	 * 1. Walk the vif list, matching vifs against this ifnet.
	 * 2. Walk the multicast forwarding cache (mfc) looking for
	 *    inner matches with this vif's index.
	 * 3. Expire any matching multicast forwarding cache entries.
	 * 4. Free vif state. This should disable ALLMULTI on the interface.
	 */
	for (vifi = 0; vifi < V_numvifs; vifi++) {
		if (V_viftable[vifi].v_ifp != ifp)
			continue;
		for (i = 0; i < mfchashsize; i++) {
			struct mfc *rt, *nrt;

			LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
				if (rt->mfc_parent == vifi) {
					expire_mfc(rt);
				}
			}
		}
		del_vif_locked(vifi);
	}

	MFC_UNLOCK();
	VIF_UNLOCK();

	MROUTER_UNLOCK();
}

/*
 * Enable multicast forwarding.
 */
static int
ip_mrouter_init(struct socket *so, int version)
{

	CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__,
	    so->so_type, so->so_proto->pr_protocol);

	if (so->so_type != SOCK_RAW ||
	    so->so_proto->pr_protocol != IPPROTO_IGMP)
		return EOPNOTSUPP;

	if (version != 1)
		return ENOPROTOOPT;

	MROUTER_LOCK();

	if (ip_mrouter_unloading) {
		MROUTER_UNLOCK();
		return ENOPROTOOPT;
	}

	if (V_ip_mrouter != NULL) {
		MROUTER_UNLOCK();
		return EADDRINUSE;
	}

	V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
	    HASH_NOWAIT);

	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
	    curvnet);
	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD,
	    expire_bw_upcalls_send, curvnet);
	callout_reset(&V_bw_meter_ch, BW_METER_PERIOD,
	    expire_bw_meter_process, curvnet);

	V_ip_mrouter = so;
	ip_mrouter_cnt++;

	MROUTER_UNLOCK();

	CTR1(KTR_IPMF, "%s: done", __func__);

	return 0;
}

/*
 * Disable multicast forwarding.
 */
static int
X_ip_mrouter_done(void)
{
	struct ifnet *ifp;
	u_long i;
	vifi_t vifi;

	MROUTER_LOCK();

	if (V_ip_mrouter == NULL) {
		MROUTER_UNLOCK();
		return EINVAL;
	}

	/*
	 * Detach/disable hooks to the rest of the system.
	 */
	V_ip_mrouter = NULL;
	ip_mrouter_cnt--;
	V_mrt_api_config = 0;

	VIF_LOCK();

	/*
	 * For each phyint in use, disable promiscuous reception of all IP
	 * multicasts.
*/ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. 
*/ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; ifa_free(ifa); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. 
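To make the add_vif() preconditions concrete, here is a hedged sketch of installing a phyint vif; mrouter_sock is the descriptor from the MRT_INIT sketch earlier, and the slot and address are placeholders:

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>

static int
add_phyint_vif(int mrouter_sock, vifi_t slot, const char *local_addr)
{
	struct vifctl vc;

	memset(&vc, 0, sizeof(vc));
	vc.vifc_vifi = slot;		/* index into V_viftable */
	vc.vifc_flags = 0;		/* neither VIFF_TUNNEL nor VIFF_REGISTER */
	vc.vifc_threshold = 1;		/* minimum TTL to forward out this vif */
	vc.vifc_rate_limit = 0;		/* anything else now fails with EINVAL */
	vc.vifc_lcl_addr.s_addr = inet_addr(local_addr);
	/* EADDRINUSE if the slot is occupied; EADDRNOTAVAIL if the
	 * address matches no AF_INET interface. */
	return (setsockopt(mrouter_sock, IPPROTO_IP, MRT_ADD_VIF,
	    &vc, sizeof(vc)));
}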
*/ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig %s group %lx parent %x qh %p", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. 
*/ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__, inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. 
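The daemon's usual reaction to the IGMPMSG_NOCACHE upcall generated below is an MRT_ADD_MFC that lands in add_mfc() above and resolves the stalled entry. A hedged sketch using the classic struct mfcctl (as far as I recall, the mfcctl2 used above extends it, and the kernel accepts either size):

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>

static int
install_route(int mrouter_sock, struct in_addr src, struct in_addr grp)
{
	struct mfcctl mc;

	memset(&mc, 0, sizeof(mc));
	mc.mfcc_origin = src;		/* S */
	mc.mfcc_mcastgrp = grp;		/* G */
	mc.mfcc_parent = 0;		/* expected incoming vif */
	mc.mfcc_ttls[1] = 1;		/* forward on vif 1 when ip_ttl > 1 */
	return (setsockopt(mrouter_sock, IPPROTO_IP, MRT_ADD_MFC,
	    &mc, sizeof(mc)));
}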
*/ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ - mm = m_copy(mb0, 0, hlen); + mm = m_copym(mb0, 0, hlen, M_NOWAIT); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); 
VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; - struct mbuf *mm = m_copy(m, 0, hlen); + struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. 
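For reference, the IGMPMSG_NOCACHE and IGMPMSG_WRONGVIF upcalls queued through socket_send() arrive as datagrams on the daemon's raw IGMP socket. A hedged reading sketch follows; the assumption that im_mbz == 0 marks a kernel upcall (im_mbz overlays the IP protocol byte, which is never zero in genuine IGMP traffic) is the usual daemon-side convention, not something this diff states:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>

static void
read_one_upcall(int mrouter_sock)
{
	struct igmpmsg im;
	ssize_t n;

	n = recv(mrouter_sock, &im, sizeof(im), 0);
	if (n < (ssize_t)sizeof(im) || im.im_mbz != 0)
		return;		/* short read, or genuine IGMP traffic */
	switch (im.im_msgtype) {
	case IGMPMSG_NOCACHE:
		/* resolve (im.im_src, im.im_dst), then MRT_ADD_MFC */
		break;
	case IGMPMSG_WRONGVIF:
		/* possible PIM assert for vif im.im_vif */
		break;
	}
}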
*/ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. 
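send_packet() above hands each copy to ip_output() with a kernel-built struct ip_moptions; imo_multicast_ttl is set from ip_ttl - 1, matching the "compensate for -1" note in X_ip_mforward(). For comparison, a sketch of the user-space counterparts of those fields on an ordinary socket (the address is a placeholder):

#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static void
set_mcast_options(int s)
{
	u_char ttl = 16, loop = 1;
	struct in_addr ifaddr;

	ifaddr.s_addr = inet_addr("192.0.2.1");	/* outgoing interface */
	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr, sizeof(ifaddr));
	setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
	setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
}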
*/ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags 
& BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. 
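A hedged sketch of the daemon request serviced by add_bw_upcall() above; treat the field layout as an assumption from memory of ip_mroute.h. The validation shown earlier requires at least one unit flag, exactly one of GEQ/LEQ, and an interval no shorter than the minimum threshold:

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>

static int
watch_flow_rate(int mrouter_sock, struct in_addr src, struct in_addr grp)
{
	struct bw_upcall bu;

	memset(&bu, 0, sizeof(bu));
	bu.bu_src = src;
	bu.bu_dst = grp;
	bu.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
	bu.bu_threshold.b_time.tv_sec = 5;	/* measuring interval */
	bu.bu_threshold.b_packets = 10000;	/* upcall at >= 10000 pkts */
	/* EOPNOTSUPP unless MRT_MFC_BW_UPCALL was set via MRT_API_CONFIG. */
	return (setsockopt(mrouter_sock, IPPROTO_IP, MRT_ADD_BW_UPCALL,
	    &bu, sizeof(bu)));
}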
*/ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. 
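BW_METER_TIMEHASH above is a one-second-granularity timer wheel: the absolute expiry time, rounded up to the next second, selects one of BW_METER_BUCKETS list heads. A standalone model of just the bucket computation (the bucket count here is an assumption for the sketch):

#include <sys/time.h>

#define NBUCKETS 1024		/* stands in for BW_METER_BUCKETS */

static int
expiry_bucket(struct timeval start, struct timeval threshold)
{
	struct timeval next;
	long hash;

	timeradd(&start, &threshold, &next);	/* expiry = start + interval */
	hash = next.tv_sec;
	if (next.tv_usec != 0)
		hash++;		/* round up so a meter never fires early */
	return ((int)(hash % NBUCKETS));
}

bw_meter_process() then only needs to walk the buckets between the last processed second and the current one, which is why it caps its loop count at the number of buckets.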
*/ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. */ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. 
*/ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
*/ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. 
*/ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM, allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM message processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ int pim_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = *offp; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from %s", __func__, datalen, inet_ntoa(ip->ip_src)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reasons perform the checksum over part of the * message, and if error, then over the whole message. 
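The compatibility rule in the comment above reduces to: for REGISTER messages, accept a checksum over the first PIM_MINLEN bytes, otherwise require the whole datagram to verify. A self-contained model using a classic ones-complement sum (endianness subtleties ignored for brevity):

#include <stddef.h>
#include <stdint.h>

#define PIM_MINLEN 8	/* pimhdr + u_int32_t, per the comment above */

static uint16_t
cksum(const void *data, size_t len)
{
	const uint16_t *w = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *w++;
		len -= 2;
	}
	if (len)		/* trailing odd byte */
		sum += *(const uint8_t *)w;
	while (sum >> 16)	/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

static int
pim_cksum_ok(const void *pim, size_t datalen, int is_register)
{
	if (is_register && cksum(pim, PIM_MINLEN) == 0)
		return (1);	/* header-only checksum, old-style senders */
	return (cksum(pim, datalen) == 0);
}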
*/ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, inet_ntoa(encap_ip->ip_dst)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ - mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); + mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { - CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); + CTR1(KTR_IPMF, "%s: m_copym() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. 
mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjusted to * reflect the fact that the inner multicast data is truncated. */ *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { FREE(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, pim_encapcheck, &in_pim_protosw, NULL); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = 
X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shut down the kernel services (the check * below ensures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to clean up. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/raw_ip.c =================================================================== --- head/sys/netinet/raw_ip.c (revision 305823) +++ head/sys/netinet/raw_ip.c (revision 305824) @@ -1,1139 +1,1139 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ extern struct protosw inetsw[]; u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. 
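INP_PCBHASH_RAW above keys fully-specified raw PCBs by (protocol + local + foreign) modulo the hash mask, plus one; bucket 0 is thereby reserved for wildcard-bound sockets, which rip_input() below scans separately. A standalone restatement of the bucket choice made in rip_inshash():

#include <stdint.h>
#include <netinet/in.h>

/* bucket 0: wildcard sockets; buckets 1..mask: fully bound flows */
static uint32_t
raw_bucket(uint8_t proto, uint32_t laddr, uint32_t faddr, uint32_t hashmask)
{
	if (proto == 0 || laddr == INADDR_ANY || faddr == INADDR_ANY)
		return (0);
	return ((proto + laddr + faddr) % hashmask + 1);
}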
*/ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, NULL, 0, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE static void rip_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ripcbinfo); } VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. 
if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } if (last != NULL) { struct mbuf *n; - n = m_copy(m, 0, (int)M_COPYALL); + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * to filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); continue; } } if (last != NULL) { struct mbuf *n; - n = m_copy(m, 0, (int)M_COPYALL); + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); } else { m_freem(m); } } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options the user * may have set up with a control call. */ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. 
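The two hunks above are part of this revision's mechanical substitution: as far as I recall, m_copy() was a thin sys/mbuf.h macro, so spelling out the underlying call does not change behavior, and the (int) cast on M_COPYALL becomes unnecessary once the argument is passed straight through. The historical definition being retired was, from memory rather than from this diff:

#define m_copy(m, o, l)	m_copym((m), (o), (l), M_NOWAIT)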
*/ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) > m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. */ if (ip->ip_id == 0) ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... 
*/ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. 
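 *
 * [Editor's aside on rip_ctloutput() above -- illustrative, not part of the
 * change.] The IP_HDRINCL toggle handled there is the standard way a
 * userland program asks to supply its own IP header on output (minimal
 * sketch, standard sockets API):
 *
 *   #include <sys/socket.h>
 *   #include <netinet/in.h>
 *
 *   int
 *   hdrincl_on(int s)
 *   {
 *       int one = 1;
 *
 *       /* Subsequent sends must begin with a complete struct ip. */
 *       return (setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one,
 *           sizeof(one)));
 *   }
 *
 * With the option off, rip_output() prepends and fills in the IP header
 * itself; with it on, the INP_HDRINCL branch validates the caller-supplied
 * header instead.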
*/ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. */ IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); 
INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { /* * Give the user an updated idea of our state. 
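 *
 * (Editor's illustration -- not part of the original change. The
 * oldptr == NULL size estimate and the generation-count snapshot above form
 * the usual two-pass sysctl export pattern; a userland consumer follows the
 * same shape, sizing first and then fetching:
 *
 *   #include <sys/types.h>
 *   #include <sys/sysctl.h>
 *   #include <stdlib.h>
 *
 *   void *
 *   fetch_raw_pcblist(size_t *lenp)
 *   {
 *       void *buf;
 *
 *       // First call: kernel returns an over-estimated length.
 *       if (sysctlbyname("net.inet.raw.pcblist", NULL, lenp,
 *           NULL, 0) != 0)
 *           return (NULL);
 *       if ((buf = malloc(*lenp)) == NULL)
 *           return (NULL);
 *       // Second call: may still fail if the PCB set grew meanwhile.
 *       if (sysctlbyname("net.inet.raw.pcblist", buf, lenp,
 *           NULL, 0) != 0) {
 *           free(buf);
 *           return (NULL);
 *       }
 *       return (buf);
 *   }
 *
 * The xig_gen value written at both ends of the reply lets the reader detect
 * concurrent changes, as the comment continues below.)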
If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 305823) +++ head/sys/netinet/tcp_output.c (revision 305824) @@ -1,1814 +1,1814 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); /* * Make sure that either retransmit or persist timer is set for SYN, FIN and * non-ACK. */ #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ tcp_timer_active((tp), TT_REXMT) || \ tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. 
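 *
 * [Editor's note -- illustrative addition.] cc_after_idle() above is the
 * transmit-side entry into the pluggable congestion-control framework: it
 * simply forwards to the current algorithm's optional after_idle method.
 * A congestion-control module supplies such hooks through its struct
 * cc_algo; a minimal sketch (assuming the framework's declared hook
 * signature taking a struct cc_var) looks like:
 *
 *   static void
 *   example_after_idle(struct cc_var *ccv)
 *   {
 *       /* E.g., collapse cwnd back toward the restart window. */
 *   }
 *
 *   struct cc_algo example_cc_algo = {
 *       .name = "example",
 *       .after_idle = example_after_idle,
 *   };
 *
 * tcp_output() below invokes cc_after_idle() only when the connection has
 * been idle for at least one retransmission timeout (see the t_rcvtime test
 * shortly after entry).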
*/ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif #ifdef TCP_RFC7413 /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. 
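 *
 * [Editor's worked example -- illustrative numbers only.] Suppose
 * snd_wnd = 64k, snd_cwnd = 16k and 4k of hole data has already been
 * retransmitted this recovery episode (sack_bytes_rxmt = 4k). Then
 *
 *   cwin = min(64k, 16k) - 4k = 12k
 *
 * and for a hole with 8k left between p->rxmit and min(p->end, snd_recover)
 * the code sends len = min(12k, 8k) = 8k from offset p->rxmit - snd_una,
 * leaving new-data transmission to a later pass once the scoreboard is
 * exhausted.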
*/ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(sbavail(&so->so_snd), sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; #ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; #endif off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. 
* This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } #ifdef TCP_RFC7413 /* * When retransmitting SYN|ACK on a passively-created TFO socket, * don't include data, as the presence of data may have caused the * original SYN|ACK to have been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) || (flags & TH_RST))) len = 0; #endif if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (e.g. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and congestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwidth product. However, testing has shown this not * to be much of a problem. At worst we are trading wasting * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. * * XXXGL: should sbused() or sbavail() be used here?
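 *
 * [Editor's worked example -- illustrative numbers only.] With the defaults
 * above (sendbuf_inc = 8k, sendbuf_max = 2m), take sb_hiwat = 32768,
 * snd_wnd = 40960, 30000 bytes queued and 2000 bytes in flight. The four
 * criteria then read:
 *
 *   1. 40960 / 4 * 5 = 51200 >= 32768    (peer window large enough)
 *   2. 30000 >= 32768 / 8 * 7 = 28672    (buffer nearly full)
 *   3. 30000 < 2097152                   (below the autosize ceiling)
 *   4. sendwin >= 30000 - 2000 = 28000   (window is not the limiter)
 *
 * so sbreserve_locked() grows sb_hiwat to min(32768 + 8192, 2097152) =
 * 40960; a failed reservation simply turns SB_AUTOSIZE off for this socket.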
*/ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertisements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && #ifdef IPSEC ipsec_optlen == 0 && #endif tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more than 1/2 the maximum send window's worth of * data (the receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereby every read * from the receive buffer, no matter how small, causes a window * update to be sent. We should also avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertised window by more * than 1/4th of the socket buffer capacity.
When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (long)(2 * tp->t_maxseg) && (adv >= (long)(so->so_rcv.sb_hiwat / 4) || recwin <= (long)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. 
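 *
 * [Editor's worked example -- illustrative numbers only.] For the standalone
 * window-update test above, take t_maxseg = 1460 and so_rcv.sb_hiwat =
 * 65536. An update is sent only when the advertisable increase satisfies
 * adv >= 2 * 1460 = 2920 and additionally one of
 *
 *   adv >= 65536 / 4 = 16384     (large absolute opening), or
 *   recwin <= 65536 / 8 = 8192   (buffer nearly full, be aggressive), or
 *   sb_hiwat <= 8 * 1460         (tiny buffer; false here)
 *
 * holds; smaller openings are left to be advertised on ACKs that carry new
 * data, which keeps pure window updates rare.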
*/ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; #ifdef TCP_RFC7413 /* * Only include the TFO option on the first * transmission of the SYN|ACK on a * passively-created TFO socket, as the presence of * the TFO option may have caused the original * SYN|ACK to have been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift == 0)) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; to.to_flags |= TOF_FASTOPEN; } #endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. 
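 *
 * [Editor's worked example -- illustrative numbers only.] For an
 * established IPv4 connection that negotiated timestamps, tcp_addoptions()
 * above emits NOP + NOP + a 10-byte timestamp option, so optlen = 12 and
 *
 *   hdrlen = sizeof(struct tcpiphdr) + optlen = 40 + 12 = 52
 *
 * leaving t_maxseg - 12 bytes of payload per segment. On a SYN the MSS,
 * window-scale and SACK-permitted options join the timestamp block and
 * optlen grows accordingly (see the packing budget before tcp_addoptions()
 * further down).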
*/ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int if_hw_tsomaxsegcount; u_int if_hw_tsomaxsegsize; struct mbuf *mb; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Check if we should limit by maximum segment * size and count: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { /* * Subtract one segment for the LINK * and TCP/IP headers mbuf that will * be prepended to this mbuf chain * after the code in this section * limits the number of mbufs in the * chain to if_hw_tsomaxsegcount. */ if_hw_tsomaxsegcount -= 1; max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); while (mb != NULL && max_len < len) { u_int mlen; u_int frags; /* * Get length of mbuf fragment * and how many hardware frags, * rounded up, it would use: */ mlen = (mb->m_len - moff); frags = howmany(mlen, if_hw_tsomaxsegsize); /* Handle special case: Zero Length Mbuf */ if (frags == 0) frags = 1; /* * Check if the fragment limit * will be reached or exceeded: */ if (frags >= if_hw_tsomaxsegcount) { max_len += min(mlen, if_hw_tsomaxsegcount * if_hw_tsomaxsegsize); break; } max_len += mlen; if_hw_tsomaxsegcount -= frags; moff = 0; mb = mb->m_next; } if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxseg - optlen); if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. 
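 *
 * [Editor's worked example -- illustrative numbers only.] In the TSO
 * limiting block above, suppose if_hw_tsomax = 65536, hdrlen = 52,
 * max_linkhdr = 16 and 80000 bytes are eligible: the payload cap is
 *
 *   max_len = 65536 - 52 - 16 = 65468  =>  len = 65468, sendalot = 1.
 *
 * Then, with t_maxseg = 1500 and optlen = 12, the fractional-segment trim
 * uses max_len = 1500 - 12 = 1488; if more data remains queued, len is
 * reduced by len % 1488 (here 1484) down to 63984, so only the final burst
 * of the buffer may end in a short segment.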
*/ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) TCPSTAT_INC(tcps_sndprobe); else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { - m->m_next = m_copy(mb, moff, (int)len); + m->m_next = m_copym(mb, moff, len, M_NOWAIT); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. 
*/ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (to.to_flags & TOF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { /* * ip6_plen need not be filled now, and will be filled * in ip6_output.
*/ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #ifdef IPSEC KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { struct route_in6 ro; bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. 
However, the tcp hostcache mitigates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt nor * persist timer is already set, then activate the * persist timer. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall.
Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilities or TSO was turned off. * If we obtained mtu from ip_output(), then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options, according to the supplied parameters, at the location * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)).
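 *
 * [Editor's worked example -- illustrative numbers only.] The packing
 * budget above drives the sackblks clamp in the TOF_SACK case below. With
 * timestamps in use, an established segment starts SACK processing at
 * optlen = 12 (NOP + NOP + 10-byte timestamp); the alignment loop adds two
 * more NOPs (optlen = 14), the 2-byte SACK kind/length header makes 16, and
 *
 *   sackblks = min(to->to_nsacks, (TCP_MAXOLEN - 16) / TCPOLEN_SACK)
 *            = min(to->to_nsacks, (40 - 16) / 8) = at most 3
 *
 * so three 8-byte SACK blocks fill the option space to exactly 40 bytes.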
*/ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; #ifdef TCP_SIGNATURE case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } #endif case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } #ifdef TCP_RFC7413 case TOF_FASTOPEN: { int total_len; /* XXX is there any point to aligning this option? */ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; if (TCP_MAXOLEN - optlen < total_len) continue; *optp++ = TCPOPT_FAST_OPEN; *optp++ = total_len; if (to->to_tfo_len > 0) { bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); optp += to->to_tfo_len; } optlen += total_len; break; } #endif default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." 
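 *
 * Worked example: a 5-byte options block is closed with one EOL
 * (kind 0) and then two further zero bytes of padding to reach the
 * next 4-byte boundary, while a 7-byte block needs only the EOL.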
*/ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 305823) +++ head/sys/netinet/udp_usrreq.c (revision 305824) @@ -1,1959 +1,1960 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). 
*/ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); static VNET_DEFINE(int, udp_require_l2_bcast) = 0; #define V_udp_require_l2_bcast VNET(udp_require_l2_bcast) SYSCTL_INT(_net_inet_udp, OID_AUTO, require_l2_bcast, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_require_l2_bcast), 0, "Only treat packets sent to an L2 broadcast address as broadcast packets"); u_long udp_sendspace = 9216; /* really max datagram size */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); #endif #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) #ifdef INET static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); #endif #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_udpcb_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } static int udplite_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpliteinp"); return (0); } void udp_init(void) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. 
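 * (IPI_HASHFIELDS_2TUPLE hashes on the address pair only, without
 * ports, so the result stays stable even for fragments that carry
 * no UDP header.)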
*/ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void udplite_init(void) { in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); } /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE static void udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); static void udplite_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ulitecbinfo); } VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy, NULL); #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However, if a tunneling protocol is in place, we * increment the inpcb refcnt and unlock the inp; on return from the * tunneling protocol we then decrement the reference count. If * in_pcbrele_rlocked() returns 1, indicating the inp is gone, we return * that to the caller to tell them *not* to unlock the inp. In the case * of multicast this will cause the distribution to stop (though most * tunneling protocols known currently do *not* use multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); return (0); } #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed.
*/ return (0); } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC 768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want to restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data.
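 * If the NIC already verified the checksum (CSUM_DATA_VALID), fold
 * its partial result together with a pseudo-header sum; otherwise
 * overlay the IP header with a pseudo-header (struct ipovly) and run
 * in_cksum() across it and the datagram. A final value of zero
 * means the checksum is good.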
*/ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || ((!V_udp_require_l2_bcast || m->m_flags & M_BCAST) && in_broadcast(ip->ip_dst, ifp))) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; INP_INFO_RLOCK(pcbinfo); pcblist = udp_get_pcblist(proto); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; - if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != + NULL) { UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, n, iphlen, &udp_in)) { goto inp_lost; } } INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datagram.)
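 * (RFC 1122, section 3.2.2, forbids ICMP error messages in response
 * to datagrams addressed to a broadcast or multicast destination.)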
*/ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, m, iphlen, &udp_in) == 0) INP_RUNLOCK(last); inp_lost: INP_INFO_RUNLOCK(pcbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. 
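 * Hence the INP_LOCK_ASSERT() below, which either lock flavor
 * satisfies, rather than INP_RLOCK_ASSERT().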
*/ INP_LOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); inp->inp_route.ro_rt = (struct rtentry *)NULL; } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify); return; } /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { struct udpcb *up; up = intoudpcb(inp); if (up->u_icmp_func != NULL) { INP_RUNLOCK(inp); (*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx); } else { INP_RUNLOCK(inp); } } } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. 
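 * The pattern below: snapshot the generation count, take references
 * on matching pcbs under the read lock, copy them out to userland
 * without the global lock held, then report the final generation so
 * the caller can detect a race and retry.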
*/ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 
1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. */ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo, unlock_inp; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. 
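 *
 * In short: we take the inpcb write lock when no destination was
 * supplied (a connected send, which may use and update the cached
 * route) or when the socket is not yet bound; otherwise the cheaper
 * read lock suffices.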
*/ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; sin = (struct sockaddr_in *)addr; if (sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_WLOCK(inp); unlock_inp = UH_WLOCKED; } else { INP_RLOCK(inp); unlock_inp = UH_RLOCKED; } tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. 
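 *
 * A userland sender picks the source address per datagram with a
 * control message along these lines (sketch only; see ip(4)):
 *
 *	char cbuf[CMSG_SPACE(sizeof(struct in_addr))];
 *	struct cmsghdr *cm = (struct cmsghdr *)cbuf;
 *	cm->cmsg_level = IPPROTO_IP;
 *	cm->cmsg_type = IP_SENDSRCADDR;
 *	cm->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
 *	memcpy(CMSG_DATA(cm), &src, sizeof(struct in_addr));
 *
 * where src is the chosen struct in_addr and cbuf is then hung off
 * msg_control/msg_controllen for sendmsg(2).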
laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediately slide the data pointer forward * again, since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ?
0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Set up flowid / RSS information for the outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later on, connected socket values should be cached * in the inpcb and reused, rather than constantly * re-calculated. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) INP_HASH_RUNLOCK(pcbinfo); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); m_freem(m); return (error); } #if defined(IPSEC) && defined(IPSEC_NAT_T) /* * Potentially decap ESP in UDP frame.
Check for an ESP header * and optional marker; if present, strip the UDP header and * push the result through IPSec. * * Returns mbuf to be processed (potentially re-allocated) or * NULL if consumed and/or processed. */ static struct mbuf * udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) { size_t minlen, payload, skip, iphlen; caddr_t data; struct udpcb *up; struct m_tag *tag; struct udphdr *udphdr; struct ip *ip; INP_RLOCK_ASSERT(inp); /* * Pull up data so the longest case is contiguous: * IP/UDP hdr + non-ESP marker + ESP hdr. */ minlen = off + sizeof(uint64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ payload = m->m_len - off; /* Size of payload. */ if (payload == 1 && data[off] == '\xff') return (m); /* NB: keepalive packet, no decap. */ up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, ("u_flags 0x%x", up->u_flags)); /* * Check that the payload is large enough to hold an * ESP header and compute the amount of data to remove. * * NB: the caller has already done a pullup for us. * XXX can we assume alignment and eliminate bcopys? */ if (up->u_flags & UF_ESPINUDP_NON_IKE) { /* * draft-ietf-ipsec-nat-t-ike-0[01].txt and * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring * possible AH mode non-IKE marker+non-ESP marker * from draft-ietf-ipsec-udp-encaps-00.txt. */ uint64_t marker; if (payload <= sizeof(uint64_t) + sizeof(struct esp)) return (m); /* NB: no decap. */ bcopy(data + off, &marker, sizeof(uint64_t)); if (marker != 0) /* Non-IKE marker. */ return (m); /* NB: no decap. */ skip = sizeof(uint64_t) + sizeof(struct udphdr); } else { uint32_t spi; if (payload <= sizeof(struct esp)) { IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } bcopy(data + off, &spi, sizeof(uint32_t)); if (spi == 0) /* Non-ESP marker. */ return (m); /* NB: no decap. */ skip = sizeof(struct udphdr); } /* * Set up a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember * the UDP ports. This is required if we want to select * the right SPD for multiple hosts behind the same NAT. * * NB: ports are maintained in network byte order everywhere * in the NAT-T code. */ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } iphlen = off - sizeof(struct udphdr); udphdr = (struct udphdr *)(data + iphlen); ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; m_tag_prepend(m, tag); /* * Remove the UDP header (and possibly the non-ESP marker). * IP header length is iphlen. * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ ovbcopy(data, data + skip, iphlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We cannot yet update the cksums so clear any * h/w cksum flags as they are no longer valid. */ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p), AF_INET, ip->ip_p); return (NULL); /* NB: consumed, bypass processing.
*/ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if ((up->u_tun_func != NULL) || (up->u_icmp_func != NULL)) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_icmp_func = i; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); 
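	/*
	 * Note the tear-down order below: sever inp_ppcb first, detach
	 * and free the inpcb while the pcbinfo lock is still held, and
	 * free the udpcb itself only after all locks are dropped.
	 */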
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 305823) +++ head/sys/netinet6/icmp6.c (revision 305824) @@ -1,2870 +1,2870 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); static VNET_DEFINE(int, icmp6errpps_count) = 0; static VNET_DEFINE(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) #define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. 
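 *
 * For example, a module would bump the outgoing-error counter with
 *
 *	kmod_icmp6stat_inc(offsetof(struct icmp6stat, icp6s_error) /
 *	    sizeof(u_quad_t));
 *
 * typically via a convenience wrapper macro rather than spelled out
 * like this.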
*/ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? 
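 * (An ICMPv6 error can only go back to the source of the offending
 * packet; an unspecified or multicast source is not a usable
 * destination, so we give up here.)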
*/ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m->m_pkthdr.rcvif = NULL; ICMP6STAT_INC(icp6s_outhist[type]); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int ip6len, error; ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. 
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */ code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; - if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). 
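 *
 * To make the length arithmetic below concrete (made-up sizes, not a
 * real trace): for a plain echo request off == noff == 40, so a
 * 1048-byte copy n0 is trimmed by m_adj(n0, 48) to its 1000 payload
 * bytes while n carries the 48 header bytes, and the chain totals
 * n0len + (noff - off) = 1048; if extension headers were present
 * (off > 40), exactly those bytes disappear from the reply.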
*/ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { nip6 = mtod(n, struct ip6_hdr *); IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, sizeof(*nicmp6)); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(m, off, icmp6len) != 0) return (IPPROTO_DONE); /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif - n = m_copy(m, 0, M_COPYALL); + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { struct prison *pr; u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; } maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* * Copy IPv6 and ICMPv6 only. 
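 * The WRU reply built here ends up laid out as (byte offsets):
 *
 *	0         40          48                  52
 *	| ip6_hdr  | icmp6_hdr | 4 zero-byte "TTL" | hostname... |
 *
 * with at most maxhlen bytes of pr_hostname appended.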
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* give up local */ /* Send incoming SeND packet to user space. */ if (send_sendso_input_hook != NULL) { IP6_EXTHDR_CHECK(m, off, icmp6len, IPPROTO_DONE); error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); /* -1 == no app on SEND socket */ if (error == 0) return (IPPROTO_DONE); nd6_rs_input(m, off, icmp6len); } else nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } if (send_sendso_input_hook != NULL) { IP6_EXTHDR_CHECK(n, off, icmp6len, IPPROTO_DONE); error = send_sendso_input_hook(n, ifp, SND_IN, ip6len); if (error == 0) goto freeit; /* -1 == no app on SEND socket */ nd6_rs_input(n, off, icmp6len); } else nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Send incoming SeND-protected/ND packet to user space. */ if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) return (IPPROTO_DONE); nd6_ra_input(m, off, icmp6len); } else nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(n, ifp, SND_IN, ip6len); if (error == 0) goto freeit; nd6_ra_input(n, off, icmp6len); } else nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) return (IPPROTO_DONE); nd6_ns_input(m, off, icmp6len); } else nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(n, ifp, SND_IN, ip6len); if (error == 0) goto freeit; nd6_ns_input(n, off, icmp6len); } else nd6_ns_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Send incoming SeND-protected/ND packet to user space. 
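 * The same dispatch shape repeats for RS/RA/NS/NA and redirects; as a
 * sketch, with nd6_xx_input standing in for the per-type handler (not
 * a real symbol):
 *
 *	if (send_sendso_input_hook != NULL) {
 *		if (send_sendso_input_hook(pkt, ifp, SND_IN, ip6len) == 0)
 *			done;			(a userland SeND app took it)
 *		else
 *			nd6_xx_input(pkt, off, icmp6len);
 *	} else
 *		nd6_xx_input(pkt, off, icmp6len);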
*/ if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) return (IPPROTO_DONE); nd6_na_input(m, off, icmp6len); } else nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(n, ifp, SND_IN, ip6len); if (error == 0) goto freeit; nd6_na_input(n, off, icmp6len); } else nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) return (IPPROTO_DONE); icmp6_redirect_input(m, off); } else icmp6_redirect_input(m, off); m = NULL; goto freeit; } if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(n, ifp, SND_IN, ip6len); if (error == 0) goto freeit; icmp6_redirect_input(n, off); } else icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc)(int, struct sockaddr *, void *); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? 
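 * In practice the walk does terminate: every case either jumps to
 * notify or advances eoff by at least 8 bytes, so the IP6_EXTHDR_CHECK
 * or IP6_EXTHDR_GET length test must eventually fail.  An explicit
 * bound would look like this hypothetical variant (hdrlimit is
 * invented for the sketch, not part of the code):
 *
 *	int hdrlimit = 64;
 *	while (hdrlimit-- > 0) { ... the same switch ... }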
*/ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { ICMP6STAT_INC(icp6s_tooshort); return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. 
*/ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*)(int, struct sockaddr *, void *)) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. 
* * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; struct prison *pr; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { ifa_free(&ia6->ia_ifa); nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } ifa_free(&ia6->ia_ifa); } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? 
* * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... 
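 * For scale: IPV6_MMTU is 1280 by definition, while MCLBYTES is
 * typically 2048, so a single cluster already exceeds the minimum
 * IPv6 MTU; only the NODEADDR case capped replylen at MCLBYTES above
 * (truncating later), the remaining qtypes simply fail here.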
*/ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". 
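 *
 * A worked example of the encoding produced here (hypothetical names):
 * "www.foo.example", with two dots, becomes
 *
 *	\003 w w w \003 f o o \007 e x a m p l e \000
 *
 * while "foo.example", with one dot, is guessed to be a truncated
 * hostname and gets a second terminator:
 *
 *	\003 f o o \007 e x a m p l e \000 \000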
*/ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; }
/* * check if two DNS-encoded strings match. takes care of the truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; }
/* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec.
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } IF_ADDR_RUNLOCK(ifp); if (iffound) { *ifpp = ifp; IFNET_RUNLOCK_NOSLEEP(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK_NOSLEEP(); return (addrs); }
static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* nothing to copy */ IFNET_RLOCK_NOSLEEP(); ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_link)) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * preferred addresses should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { IF_ADDR_RUNLOCK(ifp); /* * There is no room to copy any more addresses. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen.
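 *
 * Concretely (made-up clock values): with time_uptime == 1000 and
 * ia6t_expire == 1600 the reply advertises htonl(600); an already
 * expired address advertises 0; a static address (ia6t_expire == 0)
 * advertises ND6_INFINITE_LIFETIME.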
*/ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } IF_ADDR_RUNLOCK(ifp); if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK_NOSLEEP(); return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *in6p; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; INP_RLOCK(in6p); if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) { INP_RUNLOCK(in6p); continue; } if (last != NULL) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. 
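 * Illustrative numbers (invented): a 90-byte ICMPv6 packet arriving
 * in a 2 KB cluster fits within MHLEN, so the copy below charges the
 * raw socket's receive buffer roughly one plain mbuf instead of an
 * mbuf plus a whole cluster.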
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || - (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { + (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked( &last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->inp_socket->so_rcv); if (sbappendaddr_locked(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->inp_socket->so_rcv); } else sorwakeup_locked(last->inp_socket); INP_RUNLOCK(last); } else { m_freem(m); IP6STAT_DEC(ip6s_delivered); } return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. 
* The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding a packet * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } if (ia != NULL) ifa_free(&ia->ia_ifa); } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid); error = in6_selectsrc_addr(RT_DEFAULT_FIB, &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; }
void icmp6_fasttimo(void) { mld_fasttimo(); } void icmp6_slowtimo(void) { mld_slowtimo(); }
static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; }
void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); ifp = m->m_pkthdr.rcvif; /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { ICMP6STAT_INC(icp6s_tooshort); return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &kdst, scopeid, 0, 0, &nh6) == 0) { if ((nh6.nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh6.nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case.
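 * Sketched with made-up addresses: in the router case the result is a
 * host route like "2001:db8::1 -> gw fe80::2 (RTF_HOST|RTF_GATEWAY)";
 * in the on-link case the gateway is our own interface address and
 * only RTF_HOST is set.  The in6_rtredirect() loop below installs the
 * route once per FIB.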
*/ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = RTF_HOST; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) in6_rtredirect((struct sockaddr *)&sdst, gw, (struct sockaddr *)NULL, rt_flags, (struct sockaddr *)&ssrc, fibnum); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, rt->rt_fibnum); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. 
*/ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(router_ll6, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
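 * Worked example (made-up length): a 1001-byte original gives
 * extra = 1001 % 8 = 1; with at least 7 bytes of trailing space it is
 * padded to 1008, otherwise it is truncated to 1000, keeping the
 * redirected-header option a multiple of 8 bytes long.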
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? 
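 * ppsratecheck() keeps a (timestamp, counter) pair per limiter; the
 * pattern, sketched with invented names:
 *
 *	static struct timeval lasttime;
 *	static int curpps;
 *
 *	if (ppsratecheck(&lasttime, &curpps, 100))
 *		send();		(still under 100 packets this second)
 *
 * Here the budget is V_icmp6errppslim, presumably exposed as the
 * net.inet6.icmp6.errppslimit sysctl (name assumed, not verified).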
* * dst - not used at this moment * type - not used at this moment * code - not used at this moment */ static int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count, V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/netinet6/ip6_forward.c =================================================================== --- head/sys/netinet6/ip6_forward.c (revision 305823) +++ head/sys/netinet6/ip6_forward.c (revision 305824) @@ -1,626 +1,627 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #endif /* IPSEC */ /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. 
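 *
 * In outline, the ICMPv6 errors this function can generate are (a
 * sketch of the cases handled below):
 *
 *	hop limit exhausted       -> ICMP6_TIME_EXCEED_TRANSIT
 *	no route to destination   -> ICMP6_DST_UNREACH_NOROUTE
 *	scope violation           -> ICMP6_DST_UNREACH_BEYONDSCOPE
 *	packet exceeds link MTU   -> ICMP6_PACKET_TOO_BIG (with mtu)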
* */ void ip6_forward(struct mbuf *m, int srcrt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst = NULL; struct rtentry *rt = NULL; struct route_in6 rin6; int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *origifp; /* maybe unnecessary */ u_int32_t inzone, outzone; struct in6_addr src_in6, dst_in6, odst; #ifdef IPSEC struct secpolicy *sp = NULL; #endif struct m_tag *fwd_tag; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* * Do not forward packets to a multicast destination (that should be * handled by ip6_mforward()). * Do not forward packets with an unspecified source. It was discussed * in July 2000, on the ipngwg mailing list. */ if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 || IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6STAT_INC(ip6s_cantforward); /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } m_freem(m); return; } #ifdef IPSEC /* * Check if this packet has an active SA and needs to be dropped * instead of forwarded. */ if (ip6_ipsec_fwd(m) != 0) { IP6STAT_INC(ip6s_cantforward); m_freem(m); return; } #endif /* IPSEC */ #ifdef IPSTEALTH if (!V_ip6stealth) { #endif if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; #ifdef IPSTEALTH } #endif /* * Save at most ICMPV6_PLD_MAXLEN (= the min IPv6 MTU - * size of IPv6 + ICMPv6 headers) bytes of the packet in case * we need to generate an ICMP6 message to the src. * Thanks to M_EXT, in most cases copy will not occur. * * It is important to save it before IPsec processing as IPsec * processing may modify the mbuf. */ - mcopy = m_copy(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN)); + mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN), + M_NOWAIT); #ifdef IPSEC /* get a security policy for this packet */ sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, &error); if (sp == NULL) { IPSEC6STAT_INC(ips_out_inval); IP6STAT_INC(ip6s_cantforward); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ IPSEC6STAT_INC(ips_out_polvio); IP6STAT_INC(ip6s_cantforward); KEY_FREESP(&sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ KEY_FREESP(&sp); goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* XXX should be panic ? */ printf("ip6_forward: No IPsec request specified.\n"); IP6STAT_INC(ip6s_cantforward); KEY_FREESP(&sp); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } m_freem(m); return; } /* do IPsec */ break; case IPSEC_POLICY_ENTRUST: default: /* should be panic ?? */ printf("ip6_forward: Invalid policy found. %d\n", sp->policy); KEY_FREESP(&sp); goto skip_ipsec; } { struct ipsecrequest *isr = NULL; /* * When the kernel forwards a packet, it is not proper to apply * IPsec transport mode to the packet. This check avoids that.
* at present, if there is even a transport mode SA request in the * security policy, the kernel does not apply IPsec to the packet. * this check is not enough because the following case is valid. * ipsec esp/tunnel/xxx-xxx/require esp/transport//require; */ for (isr = sp->req; isr; isr = isr->next) { if (isr->saidx.mode == IPSEC_MODE_ANY) goto doipsectunnel; if (isr->saidx.mode == IPSEC_MODE_TUNNEL) goto doipsectunnel; } /* * if there's no need for tunnel mode IPsec, skip. */ if (!isr) goto skip_ipsec; doipsectunnel: /* * All the extension headers will become inaccessible * (since they can be encrypted). * Don't panic, we need no more updates to extension headers * on inner IPv6 packet (since they are now encapsulated). * * IPv6 [ESP|AH] IPv6 [extension headers] payload */ /* * If we need to encapsulate the packet, do it here; * ipsec6_process_packet will send the packet using ip6_output. */ error = ipsec6_process_packet(m, sp->req); /* Release SP if an error occurred */ if (error != 0) KEY_FREESP(&sp); if (error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We * will just continue to process the packet without * IPsec processing. */ error = 0; goto skip_ipsec; } if (error) { /* mbuf is already reclaimed in ipsec6_process_packet. */ switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip6_output (ipsec): error code %d\n", error); /* FALLTHROUGH */ case ENOENT: /* don't show these error codes to the user */ break; } IP6STAT_INC(ip6s_cantforward); if (mcopy) { #if 0 /* XXX: what icmp ? */ #else m_freem(mcopy); #endif } return; } else { /* * In the FAST IPSec case we have already * re-injected the packet and it has been freed * by the ipsec_done() function. So, just clean * up after ourselves. */ m = NULL; goto freecopy; } } skip_ipsec: #endif again: bzero(&rin6, sizeof(struct route_in6)); dst = (struct sockaddr_in6 *)&rin6.ro_dst; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; again2: rin6.ro_rt = in6_rtalloc1((struct sockaddr *)dst, 0, 0, M_GETFIB(m)); rt = rin6.ro_rt; if (rin6.ro_rt != NULL) RT_UNLOCK(rin6.ro_rt); else { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); } goto bad; } /* * Source scope check: if a packet can't be delivered to its * destination for the reason that the destination is beyond the scope * of the source address, discard the packet and return an icmp6 * destination unreachable error with Code 2 (beyond scope of source * address). We use a local copy of ip6_src, since in6_setscope() * will possibly modify its first argument.
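 *
 * (in6_setscope() may embed the zone ID into the address it is handed,
 * KAME-style, e.g. a hypothetical fe80::1 on zone 2 becomes fe80:2::1
 * in kernel-internal form; that is why the checks below work on the
 * src_in6 scratch copy rather than on ip6->ip6_src itself.)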
* [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ src_in6 = ip6->ip6_src; if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { /* XXX: this should not happen */ IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); goto bad; } if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); goto bad; } if (inzone != outzone) { IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); goto bad; } /* * Destination scope check: if a packet is going to break the scope * zone of the packet's destination address, discard it. This case * should usually be prevented by an appropriately configured routing * table, but we need an explicit check because we may mistakenly * forward the packet to a different zone by (e.g.) a default route. */ dst_in6 = ip6->ip6_dst; if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || inzone != outzone) { IP6STAT_INC(ip6s_cantforward); IP6STAT_INC(ip6s_badscope); goto bad; } if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in6 *)rt->rt_gateway; /* * If we are to forward the packet using the same interface * as one we got the packet from, perhaps we should send a redirect * to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a route * modified by a redirect. */ if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* * If the incoming interface is equal to the outgoing * one, and the link attached to the interface is * point-to-point, then it will be highly probable * that a routing loop occurs. Thus, we immediately * drop the packet and send an ICMPv6 error message. * * type/code is based on suggestion by Rich Draves. * not sure if it is the best pick. */ icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); goto bad; } type = ND_REDIRECT; } /* * Fake scoped addresses. Note that even link-local source or * destination can appear, if the originating node just sends the * packet to us (without address resolution for the destination). * Since both icmp6_error and icmp6_redirect_output fill the embedded * link identifiers, we can do this stuff after making a copy for * returning an error. */ if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { /* * See corresponding comments in ip6_output. * XXX: but is it possible that ip6_forward() sends a packet * to a loopback interface? I don't think so, and thus * I bark here. (jinmei@kame.net) * XXX: it is common to route invalid packets to loopback. * also, the codepath will be visited on use of ::1 in * rthdr. (itojun) */ #if 1 if (0) #else if ((rt->rt_flags & (RTF_BLACKHOLE|RTF_REJECT)) == 0) #endif { printf("ip6_forward: outgoing interface is loopback.
" "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp)); } /* we can just use rcvif in forwarding. */ origifp = m->m_pkthdr.rcvif; } else origifp = rt->rt_ifp; /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto pass; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, rt->rt_ifp, PFIL_OUT, NULL); if (error != 0 || m == NULL) goto freecopy; /* consumed by filter */ ip6 = mtod(m, struct ip6_hdr *); /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) m->m_flags |= M_FASTFWD_OURS; else { RTFREE(rt); goto again; /* Redo the routing table lookup. */ } } /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto out; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&rin6.ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); RTFREE(rt); goto again2; } pass: /* See if the size was changed by the packet filter. */ if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { u_long mtu; #ifdef IPSEC size_t ipsechdrsiz; #endif /* IPSEC */ mtu = IN6_LINKMTU(rt->rt_ifp); #ifdef IPSEC /* * When we do IPsec tunnel ingress, we need to play * with the link value (decrement IPsec header size * from mtu value). The code is much simpler than v4 * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". */ ipsechdrsiz = ipsec_hdrsiz(mcopy, IPSEC_DIR_OUTBOUND, NULL); if (ipsechdrsiz < mtu) mtu -= ipsechdrsiz; /* * if mtu becomes less than minimum MTU, * tell minimum MTU (and I'll need to fragment it). */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU; #endif /* IPSEC */ icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); } goto bad; } error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, NULL); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); IP6STAT_INC(ip6s_cantforward); } else { IP6STAT_INC(ip6s_forward); in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward); if (type) IP6STAT_INC(ip6s_redirectsent); else { if (mcopy) goto freecopy; } } if (mcopy == NULL) goto out; switch (error) { case 0: if (type == ND_REDIRECT) { icmp6_redirect_output(mcopy, rt); goto out; } goto freecopy; case EMSGSIZE: /* xxx MTU is constant in PPP? */ goto freecopy; case ENOBUFS: /* Tell source to slow down like source quench in IP? 
*/ goto freecopy; case ENETUNREACH: /* shouldn't happen, checked above */ case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP6_DST_UNREACH; code = ICMP6_DST_UNREACH_ADDR; break; } icmp6_error(mcopy, type, code, 0); goto out; freecopy: m_freem(mcopy); goto out; bad: m_freem(m); out: if (rt != NULL) RTFREE(rt); } Index: head/sys/netinet6/ip6_mroute.c =================================================================== --- head/sys/netinet6/ip6_mroute.c (revision 305823) +++ head/sys/netinet6/ip6_mroute.c (revision 305824) @@ -1,1969 +1,1970 @@ /*- * Copyright (C) 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $ */ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 * BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1994 * * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry"); static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *); static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); static int set_pim6(int *); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *); extern int in6_mcast_loop; extern struct domain inet6domain; static const struct encaptab *pim6_encap_cookie; static const struct protosw in6_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; static int pim6_encapcheck(const struct mbuf *, int, int, void *); static VNET_DEFINE(int, ip6_mrouter_ver) = 0; #define V_ip6_mrouter_ver VNET(ip6_mrouter_ver) SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); static struct mrt6stat mrt6stat; SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW, &mrt6stat, mrt6stat, "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)"); #define MRT6STAT_INC(name) mrt6stat.name += 1 #define NO_RTE_FOUND 0x1 #define RTE_FOUND 0x2 static struct mtx mrouter6_mtx; #define MROUTER6_LOCK() mtx_lock(&mrouter6_mtx) #define MROUTER6_UNLOCK() mtx_unlock(&mrouter6_mtx) #define MROUTER6_LOCK_ASSERT() do { \ mtx_assert(&mrouter6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MROUTER6_LOCK_INIT() \ mtx_init(&mrouter6_mtx, "IPv6 multicast forwarding", NULL, MTX_DEF) #define MROUTER6_LOCK_DESTROY() mtx_destroy(&mrouter6_mtx) static struct mf6c *mf6ctable[MF6CTBLSIZ]; SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD, &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]", "IPv6 Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], " "netinet6/ip6_mroute.h)"); static struct mtx mfc6_mtx; #define MFC6_LOCK() mtx_lock(&mfc6_mtx) #define MFC6_UNLOCK() mtx_unlock(&mfc6_mtx) #define 
MFC6_LOCK_ASSERT() do { \ mtx_assert(&mfc6_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define MFC6_LOCK_INIT() \ mtx_init(&mfc6_mtx, "IPv6 multicast forwarding cache", NULL, MTX_DEF) #define MFC6_LOCK_DESTROY() mtx_destroy(&mfc6_mtx) static u_char n6expire[MF6CTBLSIZ]; static struct mif6 mif6table[MAXMIFS]; static int sysctl_mif6table(SYSCTL_HANDLER_ARGS) { struct mif6_sctl *out; int error; out = malloc(sizeof(struct mif6_sctl) * MAXMIFS, M_TEMP, M_WAITOK); for (int i = 0; i < MAXMIFS; i++) { out[i].m6_flags = mif6table[i].m6_flags; out[i].m6_rate_limit = mif6table[i].m6_rate_limit; out[i].m6_lcl_addr = mif6table[i].m6_lcl_addr; if (mif6table[i].m6_ifp != NULL) out[i].m6_ifp = mif6table[i].m6_ifp->if_index; else out[i].m6_ifp = 0; out[i].m6_pkt_in = mif6table[i].m6_pkt_in; out[i].m6_pkt_out = mif6table[i].m6_pkt_out; out[i].m6_bytes_in = mif6table[i].m6_bytes_in; out[i].m6_bytes_out = mif6table[i].m6_bytes_out; } error = SYSCTL_OUT(req, out, sizeof(struct mif6_sctl) * MAXMIFS); free(out, M_TEMP); return (error); } SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, mif6table, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_mif6table, "S,mif6_sctl[MAXMIFS]", "IPv6 Multicast Interfaces (struct mif6_sctl[MAXMIFS], " "netinet6/ip6_mroute.h)"); static struct mtx mif6_mtx; #define MIF6_LOCK() mtx_lock(&mif6_mtx) #define MIF6_UNLOCK() mtx_unlock(&mif6_mtx) #define MIF6_LOCK_ASSERT() mtx_assert(&mif6_mtx, MA_OWNED) #define MIF6_LOCK_INIT() \ mtx_init(&mif6_mtx, "IPv6 multicast interfaces", NULL, MTX_DEF) #define MIF6_LOCK_DESTROY() mtx_destroy(&mif6_mtx) #ifdef MRT6DEBUG static VNET_DEFINE(u_int, mrt6debug) = 0; /* debug level */ #define V_mrt6debug VNET(mrt6debug) #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define DEBUG_EXPIRE 0x08 #define DEBUG_XMIT 0x10 #define DEBUG_REG 0x20 #define DEBUG_PIM 0x40 #define DEBUG_ERR 0x80 #define DEBUG_ANY 0x7f #define MRT6_DLOG(m, fmt, ...) \ if (V_mrt6debug & (m)) \ log(((m) & DEBUG_ERR) ? LOG_ERR: LOG_DEBUG, \ "%s: " fmt "\n", __func__, ##__VA_ARGS__) #else #define MRT6_DLOG(m, fmt, ...) #endif static void expire_upcalls(void *); #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * XXX TODO: maintain a count to if_allmulti() calls in struct ifnet. */ /* * 'Interfaces' associated with decapsulator (so we can tell * packets that went through it from ones that get reflected * by a broken gateway). Different from IPv4 register_if, * these interfaces are linked into the system ifnet list, * because per-interface IPv6 statistics are maintained in * ifp->if_afdata. But it does not have any routes point * to them. I.e., packets can't be sent this way. They * only exist as a placeholder for multicast source * verification. */ static struct ifnet *multicast_register_if6; #define ENCAP_HOPS 64 /* * Private variables. */ static mifi_t nummifs = 0; static mifi_t reg_mif_num = (mifi_t)-1; static struct pim6stat pim6stat; SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RW, &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim6_var.h)"); #define PIM6STAT_INC(name) pim6stat.name += 1 static VNET_DEFINE(int, pim6); #define V_pim6 VNET(pim6) /* * Hash function for a source, group entry */ #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \ (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \ (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \ (g).s6_addr32[2] ^ (g).s6_addr32[3]) /* * Find a route for a given origin IPv6 address and Multicast group address. 
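 *
 * MF6CHASH above XOR-folds all eight 32-bit words of the origin and
 * group addresses into a table index; MF6CFIND then walks that hash
 * chain for an exact (origin, group) entry that is complete, i.e. has
 * no pending upcall queue (mf6c_stall == NULL). A lookup sketch:
 *
 *	struct mf6c *rt;
 *	MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
 *	if (rt == NULL)
 *		(cache miss; mrt6s_mfc_misses was already bumped)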
*/ #define MF6CFIND(o, g, rt) do { \ struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \ rt = NULL; \ while (_rt) { \ if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \ IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \ (_rt->mf6c_stall == NULL)) { \ rt = _rt; \ break; \ } \ _rt = _rt->mf6c_next; \ } \ if (rt == NULL) { \ MRT6STAT_INC(mrt6s_mfc_misses); \ } \ } while (/*CONSTCOND*/ 0) /* * Macros to compute elapsed time efficiently * Borrowed from Van Jacobson's scheduling code * XXX: replace with timersub() ? */ #define TV_DELTA(a, b, delta) do { \ int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ case 2: \ delta += 1000000; \ /* FALLTHROUGH */ \ case 1: \ delta += 1000000; \ break; \ default: \ delta += (1000000 * xxs); \ } \ } \ } while (/*CONSTCOND*/ 0) /* XXX: replace with timercmp(a, b, <) ? */ #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #ifdef UPCALL_TIMING #define UPCALL_MAX 50 static u_long upcall_data[UPCALL_MAX + 1]; static void collate(); #endif /* UPCALL_TIMING */ static int ip6_mrouter_init(struct socket *, int, int); static int add_m6fc(struct mf6cctl *); static int add_m6if(struct mif6ctl *); static int del_m6fc(struct mf6cctl *); static int del_m6if(mifi_t *); static int del_m6if_locked(mifi_t *); static int get_mif6_cnt(struct sioc_mif_req6 *); static int get_sg_cnt(struct sioc_sg_req6 *); static struct callout expire_upcalls_ch; int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int X_ip6_mrouter_done(void); int X_ip6_mrouter_set(struct socket *, struct sockopt *); int X_ip6_mrouter_get(struct socket *, struct sockopt *); int X_mrt6_ioctl(u_long, caddr_t); /* * Handle MRT setsockopt commands to modify the multicast routing tables. 
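 *
 * A routing daemon drives these through setsockopt(2) on its raw
 * ICMPv6 socket; a hypothetical userland sketch (error handling
 * omitted):
 *
 *	int v = 1;
 *	struct mif6ctl mifc;
 *	(fill in mifc)
 *	setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &v, sizeof(v));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MIF, &mifc, sizeof(mifc));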
*/ int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt) { int error = 0; int optval; struct mif6ctl mifc; struct mf6cctl mfcc; mifi_t mifi; if (so != V_ip6_mrouter && sopt->sopt_name != MRT6_INIT) return (EPERM); switch (sopt->sopt_name) { case MRT6_INIT: #ifdef MRT6_OINIT case MRT6_OINIT: #endif error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = ip6_mrouter_init(so, optval, sopt->sopt_name); break; case MRT6_DONE: error = X_ip6_mrouter_done(); break; case MRT6_ADD_MIF: error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc)); if (error) break; error = add_m6if(&mifc); break; case MRT6_ADD_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = add_m6fc(&mfcc); break; case MRT6_DEL_MFC: error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc)); if (error) break; error = del_m6fc(&mfcc); break; case MRT6_DEL_MIF: error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi)); if (error) break; error = del_m6if(&mifi); break; case MRT6_PIM: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; error = set_pim6(&optval); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Handle MRT getsockopt commands */ int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { int error = 0; if (so != V_ip6_mrouter) return (EACCES); switch (sopt->sopt_name) { case MRT6_PIM: error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)); break; } return (error); } /* * Handle ioctl commands to obtain information from the cache */ int X_mrt6_ioctl(u_long cmd, caddr_t data) { int ret; ret = EINVAL; switch (cmd) { case SIOCGETSGCNT_IN6: ret = get_sg_cnt((struct sioc_sg_req6 *)data); break; case SIOCGETMIFCNT_IN6: ret = get_mif6_cnt((struct sioc_mif_req6 *)data); break; default: break; } return (ret); } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req6 *req) { struct mf6c *rt; int ret; ret = 0; MFC6_LOCK(); MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt); if (rt == NULL) { ret = ESRCH; } else { req->pktcnt = rt->mf6c_pkt_cnt; req->bytecnt = rt->mf6c_byte_cnt; req->wrong_if = rt->mf6c_wrong_if; } MFC6_UNLOCK(); return (ret); } /* * returns the input and output packet and byte counts on the mif provided */ static int get_mif6_cnt(struct sioc_mif_req6 *req) { mifi_t mifi; int ret; ret = 0; mifi = req->mifi; MIF6_LOCK(); if (mifi >= nummifs) { ret = EINVAL; } else { req->icount = mif6table[mifi].m6_pkt_in; req->ocount = mif6table[mifi].m6_pkt_out; req->ibytes = mif6table[mifi].m6_bytes_in; req->obytes = mif6table[mifi].m6_bytes_out; } MIF6_UNLOCK(); return (ret); } static int set_pim6(int *i) { if ((*i != 1) && (*i != 0)) return (EINVAL); V_pim6 = *i; return (0); } /* * Enable multicast routing */ static int ip6_mrouter_init(struct socket *so, int v, int cmd) { MRT6_DLOG(DEBUG_ANY, "so_type = %d, pr_protocol = %d", so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_ICMPV6) return (EOPNOTSUPP); if (v != 1) return (ENOPROTOOPT); MROUTER6_LOCK(); if (V_ip6_mrouter != NULL) { MROUTER6_UNLOCK(); return (EADDRINUSE); } V_ip6_mrouter = so; V_ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); V_pim6 = 0;/* used for stubbing out/in pim stuff */ callout_init(&expire_upcalls_ch, 0); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, 
"finished"); return (0); } /* * Disable IPv6 multicast forwarding. */ int X_ip6_mrouter_done(void) { mifi_t mifi; u_long i; struct mf6c *rt; struct rtdetq *rte; MROUTER6_LOCK(); if (V_ip6_mrouter == NULL) { MROUTER6_UNLOCK(); return (EINVAL); } /* * For each phyint in use, disable promiscuous reception of all IPv6 * multicasts. */ for (mifi = 0; mifi < nummifs; mifi++) { if (mif6table[mifi].m6_ifp && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if_allmulti(mif6table[mifi].m6_ifp, 0); } } bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; V_pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); /* * Free all multicast forwarding cache entries. */ MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { rt = mf6ctable[i]; while (rt) { struct mf6c *frt; for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } frt = rt; rt = rt->mf6c_next; free(frt, M_MRTABLE6); } } bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); MFC6_UNLOCK(); /* * Reset register interface */ if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } V_ip6_mrouter = NULL; V_ip6_mrouter_ver = 0; MROUTER6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "finished"); return (0); } static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; /* * Add a mif to the mif table */ static int add_m6if(struct mif6ctl *mifcp) { struct mif6 *mifp; struct ifnet *ifp; int error; MIF6_LOCK(); if (mifcp->mif6c_mifi >= MAXMIFS) { MIF6_UNLOCK(); return (EINVAL); } mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp != NULL) { MIF6_UNLOCK(); return (EADDRINUSE); /* XXX: is it appropriate? */ } if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) { MIF6_UNLOCK(); return (ENXIO); } ifp = ifnet_byindex(mifcp->mif6c_pifi); if (mifcp->mif6c_flags & MIFF_REGISTER) { if (reg_mif_num == (mifi_t)-1) { ifp = if_alloc(IFT_OTHER); if_initname(ifp, "register_mif", 0); ifp->if_flags |= IFF_LOOPBACK; if_attach(ifp); multicast_register_if6 = ifp; reg_mif_num = mifcp->mif6c_mifi; /* * it is impossible to guess the ifindex of the * register interface. So mif6c_pifi is automatically * calculated. */ mifcp->mif6c_pifi = ifp->if_index; } else { ifp = multicast_register_if6; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { MIF6_UNLOCK(); return (EOPNOTSUPP); } error = if_allmulti(ifp, 1); if (error) { MIF6_UNLOCK(); return (error); } } mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; mifp->m6_bytes_in = 0; mifp->m6_bytes_out = 0; /* Adjust nummifs up if the mifi is higher than nummifs */ if (nummifs <= mifcp->mif6c_mifi) nummifs = mifcp->mif6c_mifi + 1; MIF6_UNLOCK(); MRT6_DLOG(DEBUG_ANY, "mif #%d, phyint %s", mifcp->mif6c_mifi, if_name(ifp)); return (0); } /* * Delete a mif from the mif table */ static int del_m6if_locked(mifi_t *mifip) { struct mif6 *mifp = mif6table + *mifip; mifi_t mifi; struct ifnet *ifp; MIF6_LOCK_ASSERT(); if (*mifip >= nummifs) return (EINVAL); if (mifp->m6_ifp == NULL) return (EINVAL); if (!(mifp->m6_flags & MIFF_REGISTER)) { /* XXX: TODO: Maintain an ALLMULTI refcount in struct ifnet. 
*/ ifp = mifp->m6_ifp; if_allmulti(ifp, 0); } else { if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) { if_detach(multicast_register_if6); if_free(multicast_register_if6); reg_mif_num = (mifi_t)-1; multicast_register_if6 = NULL; } } bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ for (mifi = nummifs; mifi > 0; mifi--) if (mif6table[mifi - 1].m6_ifp) break; nummifs = mifi; MRT6_DLOG(DEBUG_ANY, "mif %d, nummifs %d", *mifip, nummifs); return (0); } static int del_m6if(mifi_t *mifip) { int cc; MIF6_LOCK(); cc = del_m6if_locked(mifip); MIF6_UNLOCK(); return (cc); } /* * Add an mfc entry */ static int add_m6fc(struct mf6cctl *mfccp) { struct mf6c *rt; u_long hash; struct rtdetq *rte; u_short nstl; char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; MFC6_LOCK(); MF6CFIND(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr, rt); /* If an entry already exists, just update the fields */ if (rt) { MRT6_DLOG(DEBUG_MFC, "no upcall o %s g %s p %x", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; MFC6_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr, mfccp->mf6cc_mcastgrp.sin6_addr); for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) { if (nstl++) log(LOG_ERR, "add_m6fc: %s o %s g %s p %x dbx %p\n", "multiple kernel entries", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); MRT6_DLOG(DEBUG_MFC, "o %s g %s p %x dbg %p", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent, rt->mf6c_stall); rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; /* Don't clean this guy up */ n6expire[hash]--; /* free packets Qed at the end of this entry */ for (rte = rt->mf6c_stall; rte != NULL; ) { struct rtdetq *n = rte->next; ip6_mdq(rte->m, rte->ifp, rt); m_freem(rte->m); #ifdef UPCALL_TIMING collate(&(rte->t)); #endif /* UPCALL_TIMING */ free(rte, M_MRTABLE6); rte = n; } rt->mf6c_stall = NULL; } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { MRT6_DLOG(DEBUG_MFC, "no upcall h %lu o %s g %s p %x", hash, ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr), mfccp->mf6cc_parent); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr, &mfccp->mf6cc_origin.sin6_addr)&& IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr, &mfccp->mf6cc_mcastgrp.sin6_addr)) { rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; if (rt->mf6c_expire) n6expire[hash]--; rt->mf6c_expire = 0; } } if 
(rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } /* insert new entry at head of hash chain */ rt->mf6c_origin = mfccp->mf6cc_origin; rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp; rt->mf6c_parent = mfccp->mf6cc_parent; rt->mf6c_ifset = mfccp->mf6cc_ifset; /* initialize pkt counters per src-grp */ rt->mf6c_pkt_cnt = 0; rt->mf6c_byte_cnt = 0; rt->mf6c_wrong_if = 0; rt->mf6c_expire = 0; rt->mf6c_stall = NULL; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; } } MFC6_UNLOCK(); return (0); } #ifdef UPCALL_TIMING /* * collect delay statistics on the upcalls */ static void collate(struct timeval *t) { u_long d; struct timeval tp; u_long delta; GET_TIME(tp); if (TV_LT(*t, tp)) { TV_DELTA(tp, *t, delta); d = delta >> 10; if (d > UPCALL_MAX) d = UPCALL_MAX; ++upcall_data[d]; } } #endif /* UPCALL_TIMING */ /* * Delete an mfc entry */ static int del_m6fc(struct mf6cctl *mfccp) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct sockaddr_in6 origin; struct sockaddr_in6 mcastgrp; struct mf6c *rt; struct mf6c **nptr; u_long hash; origin = mfccp->mf6cc_origin; mcastgrp = mfccp->mf6cc_mcastgrp; hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); MRT6_DLOG(DEBUG_MFC, "orig %s mcastgrp %s", ip6_sprintf(ip6bufo, &origin.sin6_addr), ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr)); MFC6_LOCK(); nptr = &mf6ctable[hash]; while ((rt = *nptr) != NULL) { if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr, &rt->mf6c_mcastgrp.sin6_addr) && rt->mf6c_stall == NULL) break; nptr = &rt->mf6c_next; } if (rt == NULL) { MFC6_UNLOCK(); return (EADDRNOTAVAIL); } *nptr = rt->mf6c_next; free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (0); } static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src) { if (s) { if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, (struct mbuf *)0) != 0) { sorwakeup(s); return (0); } } m_freem(mm); return (-1); } /* * IPv6 multicast forwarding function. This function assumes that the packet * pointed to by "ip6" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IPv6 multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. * * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff * this function is called in the originating context (i.e., not when * forwarding a packet from other node). ip6_output(), which is currently the * only function that calls this function is called in the originating context, * explicitly ensures this condition. It is caller's responsibility to ensure * that if this function is called from somewhere else in the originating * context in the future. 
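 *
 * In short, the contract sketched above is:
 *
 *	m->m_pkthdr.rcvif == NULL  -> packet originated on this node
 *	m->m_pkthdr.rcvif != NULL  -> packet is being forwarded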
*/ int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { struct rtdetq *rte; struct mbuf *mb0; struct mf6c *rt; struct mif6 *mifp; struct mbuf *mm; u_long hash; mifi_t mifi; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef UPCALL_TIMING struct timeval tp; GET_TIME(tp); #endif /* UPCALL_TIMING */ MRT6_DLOG(DEBUG_FORWARD, "src %s, dst %s, ifindex %d", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp->if_index); /* * Don't forward a packet with Hop limit of zero or one, * or a packet destined to a local-only group. */ if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) || IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) return (0); ip6->ip6_hlim--; /* * Source address check: do not forward packets with unspecified * source. It was discussed in July 2000, on ipngwg mailing list. * This is rather more serious than unicast cases, because some * MLD packets can be sent with the unspecified source address * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6STAT_INC(ip6s_cantforward); if (V_ip6_log_time + V_ip6_log_interval < time_uptime) { V_ip6_log_time = time_uptime; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif)); } return (0); } MFC6_LOCK(); /* * Determine forwarding mifs from the forwarding cache table */ MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt); MRT6STAT_INC(mrt6s_mfc_lookups); /* Entry exists, so forward if necessary */ if (rt) { MFC6_UNLOCK(); return (ip6_mdq(m, ifp, rt)); } /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon. */ MRT6STAT_INC(mrt6s_no_route); MRT6_DLOG(DEBUG_FORWARD | DEBUG_MFC, "no rte s %s g %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); /* * Allocate mbufs early so that we don't do extra work if we * are just going to fail anyway. */ rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6, M_NOWAIT); if (rte == NULL) { MFC6_UNLOCK(); return (ENOBUFS); } - mb0 = m_copy(m, 0, M_COPYALL); + mb0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); /* * Pullup packet header if needed before storing it, * as other references may modify it in the meantime. */ if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < sizeof(struct ip6_hdr))) mb0 = m_pullup(mb0, sizeof(struct ip6_hdr)); if (mb0 == NULL) { free(rte, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* is there an upcall waiting for this packet? 
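 *
 * (An entry whose mf6c_stall queue is non-NULL is an incomplete cache
 * entry: the MRT6MSG_NOCACHE upcall has been sent and packets queue
 * here until the daemon answers with MRT6_ADD_MFC, which drains the
 * queue in add_m6fc() above.)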
*/ hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst); for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rt->mf6c_origin.sin6_addr) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL)) break; } if (rt == NULL) { struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif /* no upcall, so make a new entry */ rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT); if (rt == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (ENOBUFS); } /* * Make a copy of the header to send to the user * level process */ - mm = m_copy(mb0, 0, sizeof(struct ip6_hdr)); + mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_NOWAIT); if (mm == NULL) { free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im = NULL; #ifdef MRT6_OINIT oim = NULL; #endif switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_NOCACHE; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_NOCACHE; im->im6_mbz = 0; break; default: free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (EINVAL); } MRT6_DLOG(DEBUG_FORWARD, "getting the iif info in the kernel"); for (mifp = mif6table, mifi = 0; mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; break; #endif case MRT6_INIT: im->im6_mif = mifi; break; } if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { log(LOG_WARNING, "ip6_mforward: ip6_mrouter " "socket queue full\n"); MRT6STAT_INC(mrt6s_upq_sockfull); free(rte, M_MRTABLE6); m_freem(mb0); free(rt, M_MRTABLE6); MFC6_UNLOCK(); return (ENOBUFS); } MRT6STAT_INC(mrt6s_upcalls); /* insert new entry at head of hash chain */ bzero(rt, sizeof(*rt)); rt->mf6c_origin.sin6_family = AF_INET6; rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_origin.sin6_addr = ip6->ip6_src; rt->mf6c_mcastgrp.sin6_family = AF_INET6; rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6); rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst; rt->mf6c_expire = UPCALL_EXPIRE; n6expire[hash]++; rt->mf6c_parent = MF6C_INCOMPLETE_PARENT; /* link into table */ rt->mf6c_next = mf6ctable[hash]; mf6ctable[hash] = rt; /* Add this entry to the end of the queue */ rt->mf6c_stall = rte; } else { /* determine if q has overflowed */ struct rtdetq **p; int npkts = 0; for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) if (++npkts > MAX_UPQ6) { MRT6STAT_INC(mrt6s_upq_ovflw); free(rte, M_MRTABLE6); m_freem(mb0); MFC6_UNLOCK(); return (0); } /* Add this entry to the end of the queue */ *p = rte; } rte->next = NULL; rte->m = mb0; rte->ifp = ifp; #ifdef UPCALL_TIMING rte->t = tp; #endif /* UPCALL_TIMING */ MFC6_UNLOCK(); return (0); } /* * Clean up cache entries if upcalls are not serviced * Call from the Slow Timeout mechanism, every half second. 
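 *
 * (EXPIRE_TIMEOUT is hz / 4 above, so the callout in fact fires four
 * times a second; an unanswered upcall entry lives for UPCALL_EXPIRE
 * == 6 ticks of this timer, roughly 1.5 seconds, before its queued
 * packets are dropped.)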
*/ static void expire_upcalls(void *unused) { #ifdef MRT6DEBUG char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; #endif struct rtdetq *rte; struct mf6c *mfc, **nptr; u_long i; MFC6_LOCK(); for (i = 0; i < MF6CTBLSIZ; i++) { if (n6expire[i] == 0) continue; nptr = &mf6ctable[i]; while ((mfc = *nptr) != NULL) { rte = mfc->mf6c_stall; /* * Skip real cache entries * Make sure it wasn't marked to not expire (shouldn't happen) * If it expires now */ if (rte != NULL && mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { MRT6_DLOG(DEBUG_EXPIRE, "expiring (%s %s)", ip6_sprintf(ip6bufo, &mfc->mf6c_origin.sin6_addr), ip6_sprintf(ip6bufg, &mfc->mf6c_mcastgrp.sin6_addr)); /* * drop all the packets * free the mbuf with the pkt, if, timing info */ do { struct rtdetq *n = rte->next; m_freem(rte->m); free(rte, M_MRTABLE6); rte = n; } while (rte != NULL); MRT6STAT_INC(mrt6s_cache_cleanups); n6expire[i]--; *nptr = mfc->mf6c_next; free(mfc, M_MRTABLE6); } else { nptr = &mfc->mf6c_next; } } } MFC6_UNLOCK(); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); } /* * Packet forwarding routine once entry in the cache is made */ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; struct in6_addr src0, dst0; /* copies for local work */ u_int32_t iszone, idzone, oszone, odzone; int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on * input, they shouldn't get counted on output, so statistics keeping is * separate. */ #define MC6_SEND(ip6, mifp, m) do { \ if ((mifp)->m6_flags & MIFF_REGISTER) \ register_send((ip6), (mifp), (m)); \ else \ phyint_send((ip6), (mifp), (m)); \ } while (/*CONSTCOND*/ 0) /* * Don't forward if it didn't arrive from the parent mif * for its origin. */ mifi = rt->mf6c_parent; if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ MRT6_DLOG(DEBUG_FORWARD, "wrong if: ifid %d mifi %d mififid %x", ifp->if_index, mifi, mif6table[mifi].m6_ifp->if_index); MRT6STAT_INC(mrt6s_wrong_if); rt->mf6c_wrong_if++; /* * If we are doing PIM processing, and we are forwarding * packets on this interface, send a message to the * routing daemon. */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) if (V_pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. * XXX: M_LOOP is an ad-hoc hack... 
*/ static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mbuf *mm; struct mrt6msg *im; #ifdef MRT6_OINIT struct omrt6msg *oim; #endif - mm = m_copy(m, 0, sizeof(struct ip6_hdr)); + mm = m_copym(m, 0, sizeof(struct ip6_hdr), + M_NOWAIT); if (mm && (!M_WRITABLE(mm) || mm->m_len < sizeof(struct ip6_hdr))) mm = m_pullup(mm, sizeof(struct ip6_hdr)); if (mm == NULL) return (ENOBUFS); #ifdef MRT6_OINIT oim = NULL; #endif im = NULL; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); oim->im6_msgtype = MRT6MSG_WRONGMIF; oim->im6_mbz = 0; break; #endif case MRT6_INIT: im = mtod(mm, struct mrt6msg *); im->im6_msgtype = MRT6MSG_WRONGMIF; im->im6_mbz = 0; break; default: m_freem(mm); return (EINVAL); } for (mifp = mif6table, iif = 0; iif < nummifs && mifp && mifp->m6_ifp != ifp; mifp++, iif++) ; switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; sin6.sin6_addr = oim->im6_src; break; #endif case MRT6_INIT: im->im6_mif = iif; sin6.sin6_addr = im->im6_src; break; } MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } /* if socket Q full */ } /* if PIM */ return (0); } /* if wrong iif */ /* If I sourced this packet, it counts as output, else it was input. */ if (m->m_pkthdr.rcvif == NULL) { /* XXX: is rcvif really NULL when output?? */ mif6table[mifi].m6_pkt_out++; mif6table[mifi].m6_bytes_out += plen; } else { mif6table[mifi].m6_pkt_in++; mif6table[mifi].m6_bytes_in += plen; } rt->mf6c_pkt_cnt++; rt->mf6c_byte_cnt += plen; /* * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ src0 = ip6->ip6_src; dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { IP6STAT_INC(ip6s_badscope); return (error); } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break * a scope boundary. * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. */ if (!(mif6table[rt->mf6c_parent].m6_flags & MIFF_REGISTER) && !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { if (in6_setscope(&src0, mif6table[mifi].m6_ifp, &oszone) || in6_setscope(&dst0, mif6table[mifi].m6_ifp, &odzone) || iszone != oszone || idzone != odzone) { IP6STAT_INC(ip6s_badscope); continue; } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } } return (0); } static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; u_long linkmtu; /* * Make a new reference to the packet; make sure that * the IPv6 header is actually copied, not just referenced, * so that ip6_output() only scribbles on the copy. */ - mb_copy = m_copy(m, 0, M_COPYALL); + mb_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr))) mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr)); if (mb_copy == NULL) { return; } /* set MCAST flag to the outgoing packet */ mb_copy->m_flags |= M_MCAST; /* * If we sourced the packet, call ip6_output since we may divide * the packet into fragments when the packet is too big for the * outgoing interface.
* Otherwise, we can simply send the packet to the interface * sending queue. */ if (m->m_pkthdr.rcvif == NULL) { struct ip6_moptions im6o; im6o.im6o_multicast_ifp = ifp; /* XXX: ip6_output will override ip6->ip6_hlim */ im6o.im6o_multicast_hlim = ip6->ip6_hlim; im6o.im6o_multicast_loop = 1; error = ip6_output(mb_copy, NULL, NULL, IPV6_FORWARDING, &im6o, NULL, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); return; } /* * If configured to loop back multicasts by default, * loop back a copy now. */ if (in6_mcast_loop) ip6_mloopback(ifp, m); /* * Put the packet into the sending queue of the outgoing interface * if it would fit in the MTU of the interface. */ linkmtu = IN6_LINKMTU(ifp); if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) { struct sockaddr_in6 dst6; bzero(&dst6, sizeof(dst6)); dst6.sin6_len = sizeof(struct sockaddr_in6); dst6.sin6_family = AF_INET6; dst6.sin6_addr = ip6->ip6_dst; IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6); /* * We just call if_output instead of nd6_output here, since * we need no ND for a multicast forwarded packet...right? */ m_clrprotoflags(m); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&dst6, NULL); MRT6_DLOG(DEBUG_XMIT, "mif %u err %d", (uint16_t)(mifp - mif6table), error); } else { /* * pMTU discovery is intentionally disabled by default, since * various router may notify pMTU in multicast, which can be * a DDoS to a router */ if (V_ip6_mcast_pmtu) icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu); else { MRT6_DLOG(DEBUG_XMIT, " packet too big on %s o %s " "g %s size %d (discarded)", if_name(ifp), ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), mb_copy->m_pkthdr.len); m_freem(mb_copy); /* simply discard the packet */ } } } static int register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m) { #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif struct mbuf *mm; int i, len = m->m_pkthdr.len; static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 }; struct mrt6msg *im6; MRT6_DLOG(DEBUG_ANY, "src %s dst %s", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); PIM6STAT_INC(pim6s_snd_registers); /* Make a copy of the packet to send to the user level process. */ mm = m_gethdr(M_NOWAIT, MT_DATA); if (mm == NULL) return (ENOBUFS); mm->m_data += max_linkhdr; mm->m_len = sizeof(struct ip6_hdr); - if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { + if ((mm->m_next = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { m_freem(mm); return (ENOBUFS); } i = MHLEN - M_LEADINGSPACE(mm); if (i > len) i = len; mm = m_pullup(mm, i); if (mm == NULL) return (ENOBUFS); /* TODO: check it! */ mm->m_pkthdr.len = len + sizeof(struct ip6_hdr); /* * Send message to routing daemon */ sin6.sin6_addr = ip6->ip6_src; im6 = mtod(mm, struct mrt6msg *); im6->im6_msgtype = MRT6MSG_WHOLEPKT; im6->im6_mbz = 0; im6->im6_mif = mif - mif6table; /* iif info is not given for reg. encap.n */ MRT6STAT_INC(mrt6s_upcalls); if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) { MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full"); MRT6STAT_INC(mrt6s_upq_sockfull); return (ENOBUFS); } return (0); } /* * pim6_encapcheck() is called by the encap6_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. 
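 *
 * encap6_input() treats the return value as a match strength: zero
 * rejects the datagram and a positive value claims it, larger values
 * winning over other registered handlers (a sketch of the contract,
 * as used below):
 *
 *	proto != IPPROTO_PIM  -> return 0   (not ours)
 *	otherwise             -> return 64  (claim the datagram)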
*/ static int pim6_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM sparse mode hook * Receives the pim control messages, and passes them up to the listening * socket, using rip6_input. * The only message processed is the REGISTER pim message; the pim header * is stripped off, and the inner packet is passed to register_mforward. */ int pim6_input(struct mbuf **mp, int *offp, int proto) { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; struct mbuf *m = *mp; int minlen; int off = *offp; PIM6STAT_INC(pim6s_rcv_total); ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; /* * Validate lengths */ if (pimlen < PIM_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); MRT6_DLOG(DEBUG_PIM, "PIM packet too short"); m_freem(m); return (IPPROTO_DONE); } /* * if the packet is at least as big as a REGISTER, go ahead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32 == 8 * PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40 */ minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN; /* * Make sure that the IP6 and PIM headers are in contiguous memory, and * possibly the PIM REGISTER header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); if (pim == NULL) { PIM6STAT_INC(pim6s_rcv_tooshort); return (IPPROTO_DONE); } #endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM { int cksumlen; /* * Validate checksum. * If PIM REGISTER, exclude the data packet */ if (pim->pim_type == PIM_REGISTER) cksumlen = PIM_MINLEN; else cksumlen = pimlen; if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { PIM6STAT_INC(pim6s_rcv_badsum); MRT6_DLOG(DEBUG_PIM, "invalid checksum"); m_freem(m); return (IPPROTO_DONE); } } #endif /* PIM6_CHECKSUM */ /* PIM version check */ if (pim->pim_ver != PIM_VERSION) { PIM6STAT_INC(pim6s_rcv_badversion); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "incorrect version %d, expecting %d", pim->pim_ver, PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } if (pim->pim_type == PIM_REGISTER) { /* * since this is a REGISTER, we'll make a copy of the register * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the * routing daemon.
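* A rough picture of the REGISTER layout handled below: outer ip6 | pim hdr (4) | u_int32_t reg hdr (4) | inner ip6 (40) | data, matching PIM6_REG_MINLEN == 4 + 4 + 40 above.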
*/ static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 }; struct mbuf *mcp; struct ip6_hdr *eip6; u_int32_t *reghdr; int rc; #ifdef MRT6DEBUG char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #endif PIM6STAT_INC(pim6s_rcv_registers); if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { MRT6_DLOG(DEBUG_PIM, "register mif not set: %d", reg_mif_num); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim6_input_to_daemon; /* * Validate length */ if (pimlen < PIM6_REG_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "register packet " "size too small %d from %s", pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src)); m_freem(m); return (IPPROTO_DONE); } eip6 = (struct ip6_hdr *) (reghdr + 1); MRT6_DLOG(DEBUG_PIM, "eip6: %s -> %s, eip6 plen %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), ntohs(eip6->ip6_plen)); /* verify the version number of the inner packet */ if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_ANY, "invalid IP version (%d) " "of the inner packet", (eip6->ip6_vfc & IPV6_VERSION)); m_freem(m); return (IPPROTO_NONE); } /* verify the inner packet is destined to a mcast group */ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { PIM6STAT_INC(pim6s_rcv_badregisters); MRT6_DLOG(DEBUG_PIM, "inner packet of register " "is not multicast %s", ip6_sprintf(ip6bufd, &eip6->ip6_dst)); m_freem(m); return (IPPROTO_DONE); } /* * make a copy of the whole header to pass to the daemon later. */ - mcp = m_copy(m, 0, off + PIM6_REG_MINLEN); + mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_NOWAIT); if (mcp == NULL) { MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "pim register: " "could not copy register head"); m_freem(m); return (IPPROTO_DONE); } /* * forward the inner ip6 packet; point m_data at the inner ip6. */ m_adj(m, off + PIM_MINLEN); MRT6_DLOG(DEBUG_PIM, "forwarding decapsulated register: " "src %s, dst %s, mif %d", ip6_sprintf(ip6bufs, &eip6->ip6_src), ip6_sprintf(ip6bufd, &eip6->ip6_dst), reg_mif_num); rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m, dst.sin6_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } /* * Pass the PIM message up to the daemon; if it is a register message * pass the 'head' only up to the daemon. This includes the * encapsulator ip6 header, pim header, register header and the * encapsulated ip6 header. 
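* For example, a null register (PIM_NULL_REGISTER set in the register header) skips the decapsulation above, so only this head reaches the daemon.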
*/ pim6_input_to_daemon: rip6_input(&m, offp, proto); return (IPPROTO_DONE); } static int ip6_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER6_LOCK_INIT(); MFC6_LOCK_INIT(); MIF6_LOCK_INIT(); pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM, pim6_encapcheck, (const struct protosw *)&in6_pim_protosw, NULL); if (pim6_encap_cookie == NULL) { printf("ip6_mroute: unable to attach pim6 encap\n"); MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); return (EINVAL); } ip6_mforward = X_ip6_mforward; ip6_mrouter_done = X_ip6_mrouter_done; ip6_mrouter_get = X_ip6_mrouter_get; ip6_mrouter_set = X_ip6_mrouter_set; mrt6_ioctl = X_mrt6_ioctl; break; case MOD_UNLOAD: if (V_ip6_mrouter != NULL) return EINVAL; if (pim6_encap_cookie) { encap_detach(pim6_encap_cookie); pim6_encap_cookie = NULL; } X_ip6_mrouter_done(); ip6_mforward = NULL; ip6_mrouter_done = NULL; ip6_mrouter_get = NULL; ip6_mrouter_set = NULL; mrt6_ioctl = NULL; MIF6_LOCK_DESTROY(); MFC6_LOCK_DESTROY(); MROUTER6_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t ip6_mroutemod = { "ip6_mroute", ip6_mroute_modevent, 0 }; DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 305823) +++ head/sys/netinet6/ip6_output.c (revision 305824) @@ -1,3088 +1,3088 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #endif /* IPSEC */ #ifdef SCTP #include #include #endif #include #include #ifdef FLOWTABLE #include #endif extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. 
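* For example, MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS) saves the old next header value in the first byte of the hbh mbuf, stores IPPROTO_HOPOPTS through the old next header pointer, splices the hbh mbuf in after mprev, and advances mprev.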
*/ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("%s: delayed m_pullup, m->len: %d plen %u off %u " "csum_flags=%b\n", __func__, m->m_len, plen, offset, (int)m->m_pkthdr.csum_flags, CSUM_BITS); /* * XXX this should not happen, but if it does, the correct * behavior may be to insert the checksum in the appropriate * next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int mtu, uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += mtu) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + mtu >= tlen) mtu = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(mtu + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); - if ((m_frgpart = m_copy(m0, off, mtu)) == NULL) { + if ((m_frgpart = m_copym(m0, off, mtu, M_NOWAIT)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f); m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum; m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? 
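* (A minimal caller sketch, assuming a fully formed skeletal header in m and default behavior for every optional argument: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);)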
*/ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ switch(ip6_ipsec_output(&m, inp, &error)) { case 1: /* Bad packet */ goto freehdrs; case -1: /* IPSec done */ goto done; case 0: /* No IPSec */ default: break; } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. 
Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == NULL) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } else ro->ro_flags |= RT_LLE_CACHE; ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* * Validate route against routing table additions; * a better/more specific route might have been added. * Make sure address family is set in route. 
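* (RT_VALIDATE is assumed here to discard the cached rt once the FIB's generation cookie has changed; a discarded or stale entry is then repopulated via in6_selectroute_fib() below.)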
*/ if (inp) { ro->ro_dst.sin6_family = AF_INET6; RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* * The outgoing interface must be in the zone of source and * destination addresses. */ origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* We should use ia_ifp to support the case of * sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; /* scope check is done. */ goto routefound; badscope: IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. 
The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets.
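* A hook may consume the packet outright (m set to NULL without error), rewrite ip6_dst, or move the packet to another FIB; each of those outcomes is handled case by case below.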
*/ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { RO_RTFREE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. 
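* (CSUM_DELAY_DATA_IPV6 is assumed to cover the delayed TCP/UDP-over-IPv6 checksum flags, so the in6_delayed_cksum() call below finalizes those checksums in software.)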
*/ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbage.
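* That is: m0 still heads the fragment list built by ip6_fragment(), but its data has been copied into the fragments, so free it first and then transmit (or, on error, free) the m_nextpkt chain.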
*/ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: /* * Release the route if using our private route, or if * (with flowtable) we don't have our own reference. */ if (ro == &ip6route || (ro != NULL && ro->ro_flags & RT_NORTREF)) RO_RTFREE(ro); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. 
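* Roughly: any extension headers beyond the basic ip6 header are m_copym()'d from m0, and the ip6_frag header then goes either into the trailing space of the last unfragmentable mbuf or into a freshly allocated one.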
*/ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct nhop6_extended nh6; struct in6_addr kdst; uint32_t scopeid; struct ifnet *ifp; u_long mtu; int error; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0) return (EHOSTUNREACH); ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0); fib6_free_nh_ext(fibnum, &nh6); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. */ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst; u_long mtu; mtu = 0; if (do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; if (ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0, &nh6) == 0) ro_pmtu->ro_mtu = nh6.nh_mtu; } mtu = ro_pmtu->ro_mtu; } if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. 
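* For example, a route MTU of 1500 with a hostcache entry of 1400 yields 1400; a recorded value below IPV6_MMTU (1280) is clamped back to 1280 with the always-frag flag set, per the RFC 2460 note below.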
*/ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); /* TCP is known to react to pmtu changes so skip hc */ if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with fragment header attached. * (fragment header is needed regardless of the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel.
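* (Hence the priv_check(td, PRIV_NETINET_SETHDROPTS) gate on the three IPV6_RECV*OPTS cases directly below.)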
*/ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. 
*/ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { in6p->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? 
td->td_ucred : NULL, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(in6p, optname, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); 
break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we do not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu_ctl(so->so_fibnum, &in6p->in6p_faddr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542.
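* For example, icmp6off above is offsetof(struct icmp6_hdr, icmp6_cksum), i.e. 2, so an ICMPv6 socket may only "set" that exact offset; any other value fails with EINVAL in the SOPT_SET case below.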
*/ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... 
*/ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int 
canwait) { if (dst == NULL || src == NULL) { printf("copypktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* do not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542.
Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. 
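 *
 * A hedged illustration of standard RFC 3542 usage (not part of
 * this change): a per-packet hop limit reaches this code as
 * ancillary data built roughly like
 *
 *	int hlim = 64;
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_HOPLIMIT;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &hlim, sizeof(int));
 *
 * whereas a sticky default hop limit is set with the
 * IPV6_UNICAST_HOPS socket option instead.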
*/ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position where the destination options header * should be inserted: before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advanced API is ambiguous on this point. * Our approach is to determine the position based * on the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types.
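 *
 * For reference, the extension header order assembled at output
 * time is (RFC 2460, section 4.1):
 *
 *	IPv6 | Hop-by-Hop | Dest1 (before rthdr) | Routing |
 *	    Dest2 (after rthdr) | upper layer
 *
 * which is why dest1 is only meaningful when a routing header is
 * also present.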
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; - copym = m_copy(m, 0, M_COPYALL); + copym = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. 
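 *
 * A sketch of the result when the header does not already sit in
 * an mbuf of its own (mh is newly allocated and takes over the
 * packet header):
 *
 *	before:	m  [ ip6_hdr | payload ... ]
 *	after:	mh [ ip6_hdr ] -> m [ payload ... ]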
*/ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/raw_ip6.c =================================================================== --- head/sys/netinet6/raw_ip6.c (revision 305823) +++ head/sys/netinet6/raw_ip6.c (revision 305824) @@ -1,905 +1,905 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; extern u_long rip_recvspace; VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat); VNET_PCPUSTAT_SYSINIT(rip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rip6stat); #endif /* VIMAGE */ /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip6_mrouter); /* * The various mrouter functions. */ int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
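 *
 * A hedged userland counterpart (assuming the process holds
 * PRIV_NETINET_RAW): datagrams of protocol "p" are delivered to
 * sockets created as
 *
 *	int s = socket(AF_INET6, SOCK_RAW, p);
 *
 * subject to the per-pcb local/foreign address checks below; a
 * protocol of 0 matches any protocol.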
*/ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; RIP6STAT_INC(rip6s_ipackets); init_sin6(&fromsa, m); /* general init */ ifp = m->m_pkthdr.rcvif; INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(in6p, &V_ripcb, inp_list) { /* XXX inp locking */ if ((in6p->inp_vflag & INP_IPV6) == 0) continue; if (in6p->inp_ip_p && in6p->inp_ip_p != proto) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) continue; if (jailed_without_vnet(in6p->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && prison_check_ip6(in6p->inp_cred, &ip6->ip6_dst) != 0) continue; } INP_RLOCK(in6p); if (in6p->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { INP_RUNLOCK(in6p); RIP6STAT_INC(rip6s_badsum); continue; } } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (in6p->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it * through unconditionally to the raw socket. * * Use the M_RTALERT_MLD flag to check for MLD * traffic without having to inspect the mbuf chain * more deeply, as all MLDv1/v2 host messages MUST * contain the Router Alert option. * * In the case of MLDv1, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. im6o_mc_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * to filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if ((m->m_flags & M_RTALERT_MLD) == 0) { struct sockaddr_in6 mcaddr; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(in6p->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); INP_RUNLOCK(in6p); continue; } } if (last != NULL) { - struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); #ifdef IPSEC /* * Check AH/ESP integrity. */ if (n && ipsec6_in_reject(n, last)) { m_freem(n); /* Do not inject data into pcb. */ } else #endif /* IPSEC */ if (n) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); opts = NULL; } INP_RUNLOCK(last); } last = in6p; } INP_INFO_RUNLOCK(&V_ripcbinfo); #ifdef IPSEC /* * Check AH/ESP integrity. */ if ((last != NULL) && ipsec6_in_reject(m, last)) { m_freem(m); IP6STAT_DEC(ip6s_delivered); /* Do not inject data into pcb.
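 * (ip6s_delivered was already incremented in the input path, which
 * is presumably why it is decremented again here and in the
 * no-socket case below when nothing is actually delivered.)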
*/ INP_RUNLOCK(last); } else #endif /* IPSEC */ if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(last, m, &opts); /* Strip intermediate headers. */ m_adj(m, *offp); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else sorwakeup(last->inp_socket); INP_RUNLOCK(last); } else { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); if (proto == IPPROTO_NONE) m_freem(m); else { char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } IP6STAT_DEC(ip6s_delivered); } return (IPPROTO_DONE); } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* * If the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options the * user may have set up with a control call. */ int rip6_output(struct mbuf *m, struct socket *so, ...) { struct mbuf *control; struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct in6_addr *dst; struct ip6_hdr *ip6; struct inpcb *in6p; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int use_defzone = 0; int hlim = 0; struct in6_addr in6a; va_list ap; va_start(ap, so); dstsock = va_arg(ap, struct sockaddr_in6 *); control = va_arg(ap, struct mbuf *); va_end(ap); in6p = sotoinpcb(so); INP_WLOCK(in6p); dst = &dstsock->sin6_addr; if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else optp = in6p->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (!optp || !optp->ip6po_pktinfo || !optp->ip6po_pktinfo->ipi6_ifindex) use_defzone = V_ip6_use_defzone; if (dstsock->sin6_scope_id == 0 && !use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics.
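 *
 * E.g. (userland sketch, assuming "s" is a SOCK_RAW socket with
 * protocol IPPROTO_ICMPV6 and dst6 a filled-in sockaddr_in6):
 * sending
 *
 *	struct icmp6_hdr h = { .icmp6_type = ICMP6_ECHO_REQUEST };
 *	sendto(s, &h, sizeof(h), 0, (struct sockaddr *)&dst6,
 *	    sizeof(dst6));
 *
 * is counted below under icp6s_outhist[ICMP6_ECHO_REQUEST].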
*/ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); /* * Source address selection. */ error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; error = prison_check_ip6(in6p->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (in6p->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = in6p->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || in6p->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; if (plen < off + 1) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } /* * Send RA/RS messages to user land for protection, before sending * them to rtadvd/rtsol. */ if ((send_sendso_input_hook != NULL) && so->so_proto->pr_protocol == IPPROTO_ICMPV6) { switch (type) { case ND_ROUTER_ADVERT: case ND_ROUTER_SOLICIT: mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m, mtag); } } error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); ICMP6STAT_INC(icp6s_outhist[type]); } else RIP6STAT_INC(rip6s_opackets); goto freectl; bad: if (m) m_freem(m); freectl: if (control != NULL) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(in6p); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { inp = sotoinpcb(so); INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: error = ip6_mrouter_set ? 
ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0) return (error); if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) return (EADDRNOTAVAIL); if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { ifa_free(ifa); return (EADDRNOTAVAIL); } if (ifa != NULL) ifa_free(ifa); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family 
!= AF_INET6) return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as they should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); /* Source address selection. XXX: need pcblookup? */ error = in6_selectsrc_socket(addr, inp->in6p_outputopts, inp, so->so_cred, scope_ambiguous, &in6a, NULL); if (error) { INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = in6a; soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip6_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } /* XXX */ bzero(&tmp, sizeof(tmp)); tmp.sin6_family = AF_INET6; tmp.sin6_len = sizeof(struct sockaddr_in6); INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dst = &tmp; } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } if (nam->sa_len != sizeof(struct sockaddr_in6)) { m_freem(m); return (EINVAL); } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; if (dst->sin6_family == AF_UNSPEC) { /* * XXX: we allow this case for backward * compatibility to buggy applications that * rely on old (and wrong) kernel behavior. */ log(LOG_INFO, "rip6 SEND: address family is " "unspec. Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { m_freem(m); return (EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); return (ret); } struct pr_usrreqs rip6_usrreqs = { .pru_abort = rip6_abort, .pru_attach = rip6_attach, .pru_bind = rip6_bind, .pru_connect = rip6_connect, .pru_control = in6_control, .pru_detach = rip6_detach, .pru_disconnect = rip6_disconnect, .pru_peeraddr = in6_getpeeraddr, .pru_send = rip6_send, .pru_shutdown = rip6_shutdown, .pru_sockaddr = in6_getsockaddr, .pru_close = rip6_close, }; Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c (revision 305823) +++ head/sys/netinet6/udp6_usrreq.c (revision 305824) @@ -1,1301 +1,1302 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ extern struct protosw inetsw[]; static void udp6_detach(struct socket *so); static int udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts; struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)fromsa, up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec6_in_reject(n, inp)) { m_freem(n); return (0); } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *ifp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; struct sockaddr_in6 fromsa; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); if (!uh) return (IPPROTO_DONE); #endif UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = proto; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? 
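 * For reference, the RFC 3828 semantics implemented above: for
 * UDP-Lite, uh_ulen carries the checksum coverage, so
 *
 *	uh_ulen == 0          -> coverage is the whole datagram (plen)
 *	uh_ulen == 8          -> header-only coverage
 *	uh_ulen < 8 or > plen -> invalid, dropped
 *
 * and the checksum itself is mandatory, so uh_sum == 0 is rejected
 * here as well.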
*/ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } /* * Construct sockaddr format source address. */ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; pcbinfo = udp_get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip6_moptions *imo; INP_INFO_RLOCK(pcbinfo); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ pcblist = udp_get_pcblist(nxt); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_lport != uh->uh_dport) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) || inp->inp_fport != uh->uh_sport) continue; } /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->in6p_moptions; if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct sockaddr_in6 mcaddr; int blocked; INP_RLOCK(inp); bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(imo, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); /* XXX */ continue; } INP_RUNLOCK(inp); } if (last != NULL) { struct mbuf *n; - if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != + NULL) { INP_RLOCK(last); UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, n, off, &fromsa)) goto inp_lost; INP_RUNLOCK(last); } } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. 
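 *
 * A hedged userland sketch of the case this loop serves: several
 * processes each run
 *
 *	int on = 1;
 *	setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
 *	bind(s, (struct sockaddr *)&group_port, sizeof(group_port));
 *
 * against the same multicast group and port, and each then gets
 * its own copy of the datagram via the m_copym() path above.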
*/ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datagram.) */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); goto badheadlocked; } INP_RLOCK(last); INP_INFO_RUNLOCK(pcbinfo); UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, m, off, &fromsa) == 0) INP_RUNLOCK(last); inp_lost: return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, &fromsa) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badheadlocked: INP_INFO_RUNLOCK(pcbinfo); badunlocked: if (m) m_freem(m); return (IPPROTO_DONE); } static void udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it.
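 * (As in rip6_ctlinput() above, "d" is expected to point at a
 * struct ip6ctlparam when the notification originated in the
 * ICMPv6 input path.)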
*/ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } if (ip6) { /* * XXX: We assume that when IPV6 is non-NULL, * M and OFF are valid. */ /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); if (!PRC_IS_REDIRECT(cmd)) { /* Check to see if it's tunneled */ struct inpcb *inp; inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst, uh.uh_dport, &ip6->ip6_src, uh.uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp != NULL) { struct udpcb *up; up = intoudpcb(inp); if (up->u_icmp_func) { /* Yes it is. */ INP_RUNLOCK(inp); (*up->u_icmp_func)(cmd, (struct sockaddr *)ip6cp->ip6c_src, d, up->u_tun_ctx); return; } else { /* Can't find it. */ INP_RUNLOCK(inp); } } } (void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } else (void)in6_pcbnotify(pcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo)); } void udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct sockaddr_in6 *sin6 = NULL; int cscov_partial = 0; int scope_ambiguous = 0; u_short fport; int error = 0; uint8_t nxt; uint16_t cscov = 0; struct ip6_pktopts *optp, opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (addr6) { /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled.
Unfortunately, some * applications do not behave as they should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return (error); } nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) goto release; optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { faddr = &sin6->sin6_addr; /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, a situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { error = in6_selectsrc_socket(sin6, optp, inp, td->td_ucred, scope_ambiguous, &in6a, NULL); if (error) goto release; laddr = &in6a; } else laddr = &inp->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->inp_lport == 0 && (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; goto release; } } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting to the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram.
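 *
 * (UDP-Lite note, per RFC 3828 as implemented below: uh_ulen is
 * reused as the checksum coverage length; a configured coverage
 * of zero, or one at least as large as the datagram, selects
 * full coverage, leaving cscov_partial at 0.)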
*/ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, a checksum coverage length of zero means * the entire UDP-Lite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } #ifdef RSS { uint32_t hash_val, hash_type; uint8_t pr; pr = inp->inp_socket->so_proto->pr_protocol; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later on, connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculated. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v6(faddr, laddr, fport, inp->inp_lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } } #endif flags = 0; #ifdef RSS /* * Don't override with the inp cached flowid. * * Until the whole UDP path is vetted, it may actually * be incorrect. */ flags |= IP_NODEFAULTFLOWID; #endif UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, &inp->inp_route6, flags, inp->in6p_moptions, NULL, inp); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_abort)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_disconnect)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); /* * XXXRW: Need to clarify locking of v4/v6 flags. 
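* For a v4-mapped destination such as ::ffff:192.0.2.1, the branch
* below converts the sockaddr with in6_sin6_2_sin(), switches the
* pcb from INP_IPV6 to INP_IPV4 and completes the connect through
* the IPv4 pcb routines.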
*/ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); goto out; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); out: INP_WUNLOCK(inp); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (void)(*pru->pru_disconnect)(so); return (0); } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto out; } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); out: INP_WUNLOCK(inp); return (0); } static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error = 0; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = NULL; if (addr == NULL) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? 
*/ INP_WUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[nxt]].pr_usrreqs; /* addr will just be freed in sendit(). */ return ((*pru->pru_send)(so, flags, m, addr, control, td)); } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif INP_HASH_WLOCK(pcbinfo); error = udp6_output(inp, m, addr, control, td); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); bad: INP_WUNLOCK(inp); m_freem(m); return (error); } struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, .pru_bind = udp6_bind, .pru_connect = udp6_connect, .pru_control = in6_control, .pru_detach = udp6_detach, .pru_disconnect = udp6_disconnect, .pru_peeraddr = in6_mapped_peeraddr, .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close }; Index: head/sys/netipsec/keysock.c =================================================================== --- head/sys/netipsec/keysock.c (revision 305823) +++ head/sys/netipsec/keysock.c (revision 305824) @@ -1,570 +1,570 @@ /* $FreeBSD$ */ /* $KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ipsec.h" /* This code has derived from sys/net/rtsock.c on FreeBSD2.2.5 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct key_cb { int key_count; int any_count; }; static VNET_DEFINE(struct key_cb, key_cb); #define V_key_cb VNET(key_cb) static struct sockaddr key_src = { 2, PF_KEY, }; static int key_sendup0(struct rawcb *, struct mbuf *, int); VNET_PCPUSTAT_DEFINE(struct pfkeystat, pfkeystat); VNET_PCPUSTAT_SYSINIT(pfkeystat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(pfkeystat); #endif /* VIMAGE */ /* * key_output() */ int key_output(struct mbuf *m, struct socket *so, ...) 
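/*
 * PF_KEY message lengths are carried in units of 8 bytes, so the check
 * below compares the mbuf chain length against
 * PFKEY_UNUNIT64(msg->sadb_msg_len), i.e. sadb_msg_len << 3.  For
 * example, a minimal request consisting of a bare struct sadb_msg
 * (16 bytes) is sent with sadb_msg_len = PFKEY_UNIT64(16) = 2.
 */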
{ struct sadb_msg *msg; int len, error = 0; if (m == NULL) panic("%s: NULL pointer was passed.\n", __func__); PFKEYSTAT_INC(out_total); PFKEYSTAT_ADD(out_bytes, m->m_pkthdr.len); len = m->m_pkthdr.len; if (len < sizeof(struct sadb_msg)) { PFKEYSTAT_INC(out_tooshort); error = EINVAL; goto end; } if (m->m_len < sizeof(struct sadb_msg)) { if ((m = m_pullup(m, sizeof(struct sadb_msg))) == NULL) { PFKEYSTAT_INC(out_nomem); error = ENOBUFS; goto end; } } M_ASSERTPKTHDR(m); KEYDEBUG(KEYDEBUG_KEY_DUMP, kdebug_mbuf(m)); msg = mtod(m, struct sadb_msg *); PFKEYSTAT_INC(out_msgtype[msg->sadb_msg_type]); if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) { PFKEYSTAT_INC(out_invlen); error = EINVAL; goto end; } error = key_parse(m, so); m = NULL; end: if (m) m_freem(m); return error; } /* * send message to the socket. */ static int key_sendup0(struct rawcb *rp, struct mbuf *m, int promisc) { int error; if (promisc) { struct sadb_msg *pmsg; M_PREPEND(m, sizeof(struct sadb_msg), M_NOWAIT); if (m == NULL) { PFKEYSTAT_INC(in_nomem); return (ENOBUFS); } pmsg = mtod(m, struct sadb_msg *); bzero(pmsg, sizeof(*pmsg)); pmsg->sadb_msg_version = PF_KEY_V2; pmsg->sadb_msg_type = SADB_X_PROMISC; pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? */ PFKEYSTAT_INC(in_msgtype[pmsg->sadb_msg_type]); } if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, m, NULL)) { PFKEYSTAT_INC(in_nomem); m_freem(m); error = ENOBUFS; } else error = 0; sorwakeup(rp->rcb_socket); return error; } /* XXX this interface should be obsoleted. */ int key_sendup(struct socket *so, struct sadb_msg *msg, u_int len, int target) { struct mbuf *m, *n, *mprev; int tlen; /* sanity check */ if (so == NULL || msg == NULL) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_KEY_DUMP, printf("%s: \n", __func__); kdebug_sadb(msg)); /* * we increment statistics here, just in case we have ENOBUFS * in this function. */ PFKEYSTAT_INC(in_total); PFKEYSTAT_ADD(in_bytes, len); PFKEYSTAT_INC(in_msgtype[msg->sadb_msg_type]); /* * Get mbuf chain whenever possible (not clusters), * to save socket buffer. We'll be generating many SADB_ACQUIRE * messages to listening key sockets. If we simply allocate clusters, * sbappendaddr() will raise ENOBUFS due to too little sbspace(). * sbspace() computes # of actual data bytes AND mbuf region. * * TODO: SADB_ACQUIRE filters should be implemented. */ tlen = len; m = mprev = NULL; while (tlen > 0) { if (tlen == len) { MGETHDR(n, M_NOWAIT, MT_DATA); if (n == NULL) { PFKEYSTAT_INC(in_nomem); return ENOBUFS; } n->m_len = MHLEN; } else { MGET(n, M_NOWAIT, MT_DATA); if (n == NULL) { PFKEYSTAT_INC(in_nomem); return ENOBUFS; } n->m_len = MLEN; } if (tlen >= MCLBYTES) { /*XXX better threshold? 
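* Only the bytes still pending count against the threshold: with
* 2k clusters, e.g., a 3000-byte message gets one cluster for its
* first 2048 bytes and plain MLEN-sized mbufs for the remaining
* 952.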
*/ if (!(MCLGET(n, M_NOWAIT))) { m_free(n); m_freem(m); PFKEYSTAT_INC(in_nomem); return ENOBUFS; } n->m_len = MCLBYTES; } if (tlen < n->m_len) n->m_len = tlen; n->m_next = NULL; if (m == NULL) m = mprev = n; else { mprev->m_next = n; mprev = n; } tlen -= n->m_len; n = NULL; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m_copyback(m, 0, len, (caddr_t)msg); /* avoid duplicated statistics */ PFKEYSTAT_ADD(in_total, -1); PFKEYSTAT_ADD(in_bytes, -len); PFKEYSTAT_ADD(in_msgtype[msg->sadb_msg_type], -1); return key_sendup_mbuf(so, m, target); } /* so can be NULL if target != KEY_SENDUP_ONE */ int key_sendup_mbuf(struct socket *so, struct mbuf *m, int target) { struct mbuf *n; struct keycb *kp; int sendup; struct rawcb *rp; int error = 0; if (m == NULL) panic("key_sendup_mbuf: NULL pointer was passed.\n"); if (so == NULL && target == KEY_SENDUP_ONE) panic("%s: NULL pointer was passed.\n", __func__); PFKEYSTAT_INC(in_total); PFKEYSTAT_ADD(in_bytes, m->m_pkthdr.len); if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (m == NULL) { PFKEYSTAT_INC(in_nomem); return ENOBUFS; } } if (m->m_len >= sizeof(struct sadb_msg)) { struct sadb_msg *msg; msg = mtod(m, struct sadb_msg *); PFKEYSTAT_INC(in_msgtype[msg->sadb_msg_type]); } mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) continue; if (rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != PF_KEY_V2) { continue; } kp = (struct keycb *)rp; /* * If you are in promiscuous mode, and when you get broadcasted * reply, you'll get two PF_KEY messages. * (based on pf_key@inner.net message on 14 Oct 1998) */ if (((struct keycb *)rp)->kp_promisc) { - if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { (void)key_sendup0(rp, n, 1); n = NULL; } } /* the exact target will be processed later */ if (so && sotorawcb(so) == rp) continue; sendup = 0; switch (target) { case KEY_SENDUP_ONE: /* the statement has no effect */ if (so && sotorawcb(so) == rp) sendup++; break; case KEY_SENDUP_ALL: sendup++; break; case KEY_SENDUP_REGISTERED: if (kp->kp_registered) sendup++; break; } PFKEYSTAT_INC(in_msgtarget[target]); if (!sendup) continue; - if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { m_freem(m); PFKEYSTAT_INC(in_nomem); mtx_unlock(&rawcb_mtx); return ENOBUFS; } if ((error = key_sendup0(rp, n, 0)) != 0) { m_freem(m); mtx_unlock(&rawcb_mtx); return error; } n = NULL; } if (so) { error = key_sendup0(sotorawcb(so), m, 0); m = NULL; } else { error = 0; m_freem(m); } mtx_unlock(&rawcb_mtx); return error; } /* * key_abort() * derived from net/rtsock.c:rts_abort() */ static void key_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } /* * key_attach() * derived from net/rtsock.c:rts_attach() */ static int key_attach(struct socket *so, int proto, struct thread *td) { struct keycb *kp; int error; KASSERT(so->so_pcb == NULL, ("key_attach: so_pcb != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NET_RAW); if (error) return error; } /* XXX */ kp = malloc(sizeof *kp, M_PCB, M_WAITOK | M_ZERO); if (kp == NULL) return ENOBUFS; so->so_pcb = (caddr_t)kp; error = raw_attach(so, proto); kp = (struct keycb *)sotorawcb(so); if (error) { free(kp, M_PCB); so->so_pcb = (caddr_t) 0; return error; } kp->kp_promisc = kp->kp_registered = 0; if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count++; V_key_cb.any_count++; soisconnected(so); so->so_options 
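/*
 * Note on the two hunks above in key_sendup_mbuf(): the removed
 * m_copy(m, 0, (int)M_COPYALL) was a 4.3BSD compatibility macro that
 * expanded to exactly the replacement m_copym(m, 0, M_COPYALL,
 * M_NOWAIT) (see the mbuf.h hunk below), so behaviour is unchanged;
 * the no-sleep flag is merely spelled out at the call sites now.
 */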
|= SO_USELOOPBACK; return 0; } /* * key_bind() * derived from net/rtsock.c:rts_bind() */ static int key_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_close() * derived from net/rtsock.c:rts_close(). */ static void key_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* * key_connect() * derived from net/rtsock.c:rts_connect() */ static int key_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return EINVAL; } /* * key_detach() * derived from net/rtsock.c:rts_detach() */ static void key_detach(struct socket *so) { struct keycb *kp = (struct keycb *)sotorawcb(so); KASSERT(kp != NULL, ("key_detach: kp == NULL")); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ V_key_cb.key_count--; V_key_cb.any_count--; key_freereg(so); raw_usrreqs.pru_detach(so); } /* * key_disconnect() * derived from net/rtsock.c:rts_disconnect() */ static int key_disconnect(struct socket *so) { return(raw_usrreqs.pru_disconnect(so)); } /* * key_peeraddr() * derived from net/rtsock.c:rts_peeraddr() */ static int key_peeraddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_peeraddr(so, nam)); } /* * key_send() * derived from net/rtsock.c:rts_send() */ static int key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return(raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* * key_shutdown() * derived from net/rtsock.c:rts_shutdown() */ static int key_shutdown(struct socket *so) { return(raw_usrreqs.pru_shutdown(so)); } /* * key_sockaddr() * derived from net/rtsock.c:rts_sockaddr() */ static int key_sockaddr(struct socket *so, struct sockaddr **nam) { return(raw_usrreqs.pru_sockaddr(so, nam)); } struct pr_usrreqs key_usrreqs = { .pru_abort = key_abort, .pru_attach = key_attach, .pru_bind = key_bind, .pru_connect = key_connect, .pru_detach = key_detach, .pru_disconnect = key_disconnect, .pru_peeraddr = key_peeraddr, .pru_send = key_send, .pru_shutdown = key_shutdown, .pru_sockaddr = key_sockaddr, .pru_close = key_close, }; /* sysctl */ SYSCTL_NODE(_net, PF_KEY, key, CTLFLAG_RW, 0, "Key Family"); /* * Definitions of protocols supported in the KEY domain. */ extern struct domain keydomain; struct protosw keysw[] = { { .pr_type = SOCK_RAW, .pr_domain = &keydomain, .pr_protocol = PF_KEY_V2, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = key_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &key_usrreqs } }; static void key_init0(void) { bzero((caddr_t)&V_key_cb, sizeof(V_key_cb)); key_init(); } struct domain keydomain = { .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, #ifdef VIMAGE .dom_destroy = key_destroy, #endif .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[nitems(keysw)] }; VNET_DOMAIN_SET(key); Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 305823) +++ head/sys/sys/mbuf.h (revision 305824) @@ -1,1316 +1,1313 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include <sys/queue.h> #ifdef _KERNEL #include <sys/systm.h> #include <vm/uma.h> #ifdef WITNESS #include <sys/lock.h> #endif #endif #ifdef _KERNEL #include <sys/sdt.h> #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculations do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with a packet header. * MINCLSIZE is the smallest amount of data that should be put into a cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type.
* mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint64_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t cosqos; /* class/quality of service */ uint8_t rsstype; /* hash type */ uint8_t l2hlen; /* layer 2 header length */ uint8_t l3hlen; /* layer 3 header length */ uint8_t l4hlen; /* layer 4 header length */ uint8_t l5hlen; /* layer 5 header length */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define lro_nsegs tso_segsz #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct m_ext { union { volatile u_int ext_count; /* value of ref count info */ volatile u_int *ext_cnt; /* pointer to ref count info */ }; caddr_t ext_buf; /* start of buffer */ uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ void (*ext_free) /* free routine if not the usual */ (struct mbuf *, void *, void *); void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. 
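* The data region is normally reached through the accessors defined
* above rather than via these fields directly, e.g.:
*
*	struct ip *ip = mtod(m, struct ip *);
*	struct udphdr *uh = mtodo(m, ip->ip_hl << 2);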
*/ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-12] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_UNUSED_8 0x00000100 /* --available-- */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */ #define M_PROTO3 0x00004000 /* protocol-specific */ #define M_PROTO4 0x00008000 /* protocol-specific */ #define M_PROTO5 0x00010000 /* protocol-specific */ #define M_PROTO6 0x00020000 /* protocol-specific */ #define M_PROTO7 0x00040000 /* protocol-specific */ #define M_PROTO8 0x00080000 /* protocol-specific */ #define M_PROTO9 0x00100000 /* protocol-specific */ #define M_PROTO10 0x00200000 /* protocol-specific */ #define M_PROTO11 0x00400000 /* protocol-specific */ #define M_PROTO12 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG| \ M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ "\27M_PROTO11\30M_PROTO12" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) to classify packets into flows.
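* (A flow is e.g. one TCP connection, identified by hashing its
* {src addr, dst addr, src port, dst port} 4-tuple.)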
These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ /* Non-standard RSS hash types */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV4_EX M_HASHTYPE_HASH(8) /* IPv4 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* * COS/QOS class and quality of service tags. * It uses DSCP code points as a base. */ #define QOS_DSCP_CS0 0x00 #define QOS_DSCP_DEF QOS_DSCP_CS0 #define QOS_DSCP_CS1 0x20 #define QOS_DSCP_AF11 0x28 #define QOS_DSCP_AF12 0x30 #define QOS_DSCP_AF13 0x38 #define QOS_DSCP_CS2 0x40 #define QOS_DSCP_AF21 0x48 #define QOS_DSCP_AF22 0x50 #define QOS_DSCP_AF23 0x58 #define QOS_DSCP_CS3 0x60 #define QOS_DSCP_AF31 0x68 #define QOS_DSCP_AF32 0x70 #define QOS_DSCP_AF33 0x78 #define QOS_DSCP_CS4 0x80 #define QOS_DSCP_AF41 0x88 #define QOS_DSCP_AF42 0x90 #define QOS_DSCP_AF43 0x98 #define QOS_DSCP_CS5 0xa0 #define QOS_DSCP_EF 0xb8 #define QOS_DSCP_CS6 0xc0 #define QOS_DSCP_CS7 0xe0 /* * External mbuf storage buffer types.
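* Each fixed-size class below is backed by its own UMA zone, and
* m_gettype()/m_getzone() further down map a byte count to these
* constants, so e.g. m_cljget(m, M_NOWAIT, MCLBYTES) attaches an
* EXT_CLUSTER buffer.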
*/ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16384 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_SFBUF_NOCACHE 8 /* sendfile(2)'s sf_buf not to be cached */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. */ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR2 0x020000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR3 0x040000 /* for vendor-internal use */ #define EXT_FLAG_VENDOR4 0x080000 /* for vendor-internal use */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * External reference/free functions. */ void sf_ext_free(void *, void *); void sf_ext_free_nocache(void *, void *); /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware.
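* As an illustrative sketch, a driver that trusts its NIC's verdict
* would set something like:
*
*	m->m_pkthdr.csum_flags |= CSUM_L3_CALC | CSUM_L3_VALID |
*	    CSUM_L4_CALC | CSUM_L4_VALID;
*	m->m_pkthdr.csum_data = 0xffff;
*
* and upper layers test the same bits via the compatibility names
* (CSUM_IP_CHECKED, CSUM_DATA_VALID, ...) defined further below.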
*/ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESED 0x40000000 /* contains merged segments */ /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESED" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). 
#define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(struct mbuf *, void *, void *), void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associate an external reference counted buffer with an mbuf.
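* As a hedged sketch, a driver exporting a buffer whose reference
* count it maintains itself might call
*
*	m_extaddref(m, buf, size, &sc->refcnt, my_free, sc, NULL);
*
* where sc and my_free() are hypothetical; my_free() then runs only
* when the last mbuf referencing the buffer is freed.
*/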
*/ static __inline void m_extaddref(struct mbuf *m, caddr_t buf, u_int size, u_int *ref_cnt, void (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. */ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? 
m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (caddr_t)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. 
We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 -/* Compatibility with 4.3. */ -#define m_copy(m, o, l) m_copym((m), (o), (l), M_NOWAIT) - extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * <cookie,type> pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * MTAG_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc. To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to ensure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect).
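* (PACKET_TAG_MACLABEL below is one such persistent tag: a MAC
* label must survive e.g. the loopback turnaround, so its id is
* defined as (19 | MTAG_PERSISTENT).)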
* * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. 
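* For example, a forwarding path can allocate its next-hop tag as
*
*	mtag = m_tag_get(PACKET_TAG_IPFORWARD,
*	    sizeof(struct sockaddr_in), M_NOWAIT);
*
* which is equivalent to m_tag_alloc() with the MTAG_ABI_COMPAT
* cookie, so m_tag_find() can later look it up by type alone.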
*/ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_ether_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */
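/*
 * A hedged usage sketch of the mbufq API above; mbufq is a simple
 * capped FIFO of packet chains.  process() is a hypothetical
 * consumer, not part of this header:
 *
 *	struct mbufq q;
 *	struct mbuf *m;
 *
 *	mbufq_init(&q, 50);		(cap the queue at 50 packets)
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m == NULL || mbufq_enqueue(&q, m) != 0)
 *		m_freem(m);		(a full queue returns ENOBUFS)
 *	while ((m = mbufq_dequeue(&q)) != NULL)
 *		process(m);
 *	mbufq_drain(&q);		(frees anything still queued)
 */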