diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h
index 7845b682f3e4..2ca5b3433e47 100644
--- a/sys/netinet/icmp6.h
+++ b/sys/netinet/icmp6.h
@@ -1,787 +1,784 @@
/*	$KAME: icmp6.h,v 1.46 2001/04/27 15:09:48 itojun Exp $	*/

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _NETINET_ICMP6_H_
#define _NETINET_ICMP6_H_

#include

#define ICMPV6_PLD_MAXLEN       1232    /* IPV6_MMTU - sizeof(struct ip6_hdr)
                                           - sizeof(struct icmp6_hdr) */

struct icmp6_hdr {
        u_int8_t        icmp6_type;     /* type field */
        u_int8_t        icmp6_code;     /* code field */
        u_int16_t       icmp6_cksum;    /* checksum field */
        union {
                u_int32_t       icmp6_un_data32[1]; /* type-specific field */
                u_int16_t       icmp6_un_data16[2]; /* type-specific field */
                u_int8_t        icmp6_un_data8[4];  /* type-specific field */
        } icmp6_dataun;
} __packed;

#define icmp6_data32    icmp6_dataun.icmp6_un_data32
#define icmp6_data16    icmp6_dataun.icmp6_un_data16
#define icmp6_data8     icmp6_dataun.icmp6_un_data8
#define icmp6_pptr      icmp6_data32[0]         /* parameter prob */
#define icmp6_mtu       icmp6_data32[0]         /* packet too big */
#define icmp6_id        icmp6_data16[0]         /* echo request/reply */
#define icmp6_seq       icmp6_data16[1]         /* echo request/reply */
#define icmp6_maxdelay  icmp6_data16[0]         /* mcast group membership */

#define ICMP6_DST_UNREACH               1       /* dest unreachable, codes: */
#define ICMP6_PACKET_TOO_BIG            2       /* packet too big */
#define ICMP6_TIME_EXCEEDED             3       /* time exceeded, code: */
#define ICMP6_PARAM_PROB                4       /* ip6 header bad */

#define ICMP6_ECHO_REQUEST              128     /* echo service */
#define ICMP6_ECHO_REPLY                129     /* echo reply */
#define MLD_LISTENER_QUERY              130     /* multicast listener query */
#define MLD_LISTENER_REPORT             131     /* multicast listener report */
#define MLD_LISTENER_DONE               132     /* multicast listener done */
#define MLD_LISTENER_REDUCTION  MLD_LISTENER_DONE /* RFC3542 definition */

/* RFC2292 decls */
#define ICMP6_MEMBERSHIP_QUERY          130     /* group membership query */
#define ICMP6_MEMBERSHIP_REPORT         131     /* group membership report */
#define ICMP6_MEMBERSHIP_REDUCTION      132     /* group membership termination */

#ifndef _KERNEL
/* the following are for backward compatibility to old KAME apps. */
#define MLD6_LISTENER_QUERY     MLD_LISTENER_QUERY
#define MLD6_LISTENER_REPORT    MLD_LISTENER_REPORT
#define MLD6_LISTENER_DONE      MLD_LISTENER_DONE
#endif

#define ND_ROUTER_SOLICIT               133     /* router solicitation */
#define ND_ROUTER_ADVERT                134     /* router advertisement */
#define ND_NEIGHBOR_SOLICIT             135     /* neighbor solicitation */
#define ND_NEIGHBOR_ADVERT              136     /* neighbor advertisement */
#define ND_REDIRECT                     137     /* redirect */

#define ICMP6_ROUTER_RENUMBERING        138     /* router renumbering */

#define ICMP6_WRUREQUEST                139     /* who are you request */
#define ICMP6_WRUREPLY                  140     /* who are you reply */
#define ICMP6_FQDN_QUERY                139     /* FQDN query */
#define ICMP6_FQDN_REPLY                140     /* FQDN reply */
#define ICMP6_NI_QUERY                  139     /* node information request */
#define ICMP6_NI_REPLY                  140     /* node information reply */
#define MLDV2_LISTENER_REPORT           143     /* RFC3810 listener report */

/* The definitions below are experimental.
TBA */ #define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */ #define MLD_MTRACE 201 /* mtrace messages */ #ifndef _KERNEL #define MLD6_MTRACE_RESP MLD_MTRACE_RESP #define MLD6_MTRACE MLD_MTRACE #endif #define ICMP6_MAXTYPE 201 #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ #define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 /* not a neighbor(obsolete) */ #define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ #define ICMP6_DST_UNREACH_POLICY 5 /* failed ingress/egress policy */ #define ICMP6_DST_UNREACH_REJECT 6 /* Reject route to destination */ #define ICMP6_DST_UNREACH_SRCROUTE 7 /* Error in source routing header */ #define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ #define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ #define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */ #define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */ #define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */ #define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */ #define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */ #define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */ #define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */ #define ICMP6_NI_SUCCESS 0 /* node information successful reply */ #define ICMP6_NI_REFUSED 1 /* node information request is refused */ #define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */ #define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */ #define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */ #define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */ /* Used in kernel only */ #define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */ #define ND_REDIRECT_ROUTER 1 /* redirect to a better router */ /* * Multicast Listener Discovery */ struct mld_hdr { struct icmp6_hdr mld_icmp6_hdr; struct in6_addr mld_addr; /* multicast address */ } __packed; /* definitions to provide backward compatibility to old KAME applications */ #ifndef _KERNEL #define mld6_hdr mld_hdr #define mld6_type mld_type #define mld6_code mld_code #define mld6_cksum mld_cksum #define mld6_maxdelay mld_maxdelay #define mld6_reserved mld_reserved #define mld6_addr mld_addr #endif /* shortcut macro definitions */ #define mld_type mld_icmp6_hdr.icmp6_type #define mld_code mld_icmp6_hdr.icmp6_code #define mld_cksum mld_icmp6_hdr.icmp6_cksum #define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] #define mld_reserved mld_icmp6_hdr.icmp6_data16[1] #define mld_v2_reserved mld_icmp6_hdr.icmp6_data16[0] #define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1] /* * Neighbor Discovery */ struct nd_router_solicit { /* router solicitation */ struct icmp6_hdr nd_rs_hdr; /* could be followed by options */ } __packed; #define nd_rs_type nd_rs_hdr.icmp6_type #define nd_rs_code nd_rs_hdr.icmp6_code #define nd_rs_cksum nd_rs_hdr.icmp6_cksum #define nd_rs_reserved nd_rs_hdr.icmp6_data32[0] struct nd_router_advert { /* router advertisement */ struct icmp6_hdr nd_ra_hdr; u_int32_t nd_ra_reachable; /* reachable time */ u_int32_t nd_ra_retransmit; /* retransmit timer */ /* could be followed by options */ } __packed; #define nd_ra_type nd_ra_hdr.icmp6_type #define nd_ra_code nd_ra_hdr.icmp6_code #define nd_ra_cksum nd_ra_hdr.icmp6_cksum #define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0] #define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1] 
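/*
 * (Example) The nd_ra_* shortcuts above alias fields of the embedded
 * icmp6_hdr, so receive-side code can treat a router advertisement as one
 * flat structure.  A hedged sketch, using the flag bits defined just
 * below; the handler name is hypothetical:
 *
 *      struct nd_router_advert *ra = (struct nd_router_advert *)icmp6;
 *      if (ra->nd_ra_flags_reserved & ND_RA_FLAG_MANAGED)
 *              use_dhcpv6_for_addresses();     (hypothetical helper)
 */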
#define ND_RA_FLAG_MANAGED 0x80 #define ND_RA_FLAG_OTHER 0x40 #define ND_RA_FLAG_HA 0x20 /* * Router preference values based on draft-draves-ipngwg-router-selection-01. * These are non-standard definitions. */ #define ND_RA_FLAG_RTPREF_MASK 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_HIGH 0x08 /* 00001000 */ #define ND_RA_FLAG_RTPREF_MEDIUM 0x00 /* 00000000 */ #define ND_RA_FLAG_RTPREF_LOW 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_RSV 0x10 /* 00010000 */ #ifdef EXPERIMENTAL #define ND_RA_FLAG_IPV6_ONLY 0x02 /* draft-ietf-6man-ipv6only-flag */ #endif #define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1] struct nd_neighbor_solicit { /* neighbor solicitation */ struct icmp6_hdr nd_ns_hdr; struct in6_addr nd_ns_target; /*target address */ /* could be followed by options */ } __packed; #define nd_ns_type nd_ns_hdr.icmp6_type #define nd_ns_code nd_ns_hdr.icmp6_code #define nd_ns_cksum nd_ns_hdr.icmp6_cksum #define nd_ns_reserved nd_ns_hdr.icmp6_data32[0] struct nd_neighbor_advert { /* neighbor advertisement */ struct icmp6_hdr nd_na_hdr; struct in6_addr nd_na_target; /* target address */ /* could be followed by options */ } __packed; #define nd_na_type nd_na_hdr.icmp6_type #define nd_na_code nd_na_hdr.icmp6_code #define nd_na_cksum nd_na_hdr.icmp6_cksum #define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0] #if BYTE_ORDER == BIG_ENDIAN #define ND_NA_FLAG_ROUTER 0x80000000 #define ND_NA_FLAG_SOLICITED 0x40000000 #define ND_NA_FLAG_OVERRIDE 0x20000000 #else #if BYTE_ORDER == LITTLE_ENDIAN #define ND_NA_FLAG_ROUTER 0x80 #define ND_NA_FLAG_SOLICITED 0x40 #define ND_NA_FLAG_OVERRIDE 0x20 #endif #endif struct nd_redirect { /* redirect */ struct icmp6_hdr nd_rd_hdr; struct in6_addr nd_rd_target; /* target address */ struct in6_addr nd_rd_dst; /* destination address */ /* could be followed by options */ } __packed; #define nd_rd_type nd_rd_hdr.icmp6_type #define nd_rd_code nd_rd_hdr.icmp6_code #define nd_rd_cksum nd_rd_hdr.icmp6_cksum #define nd_rd_reserved nd_rd_hdr.icmp6_data32[0] struct nd_opt_hdr { /* Neighbor discovery option header */ u_int8_t nd_opt_type; u_int8_t nd_opt_len; /* followed by option specific data*/ } __packed; #define ND_OPT_SOURCE_LINKADDR 1 #define ND_OPT_TARGET_LINKADDR 2 #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 #define ND_OPT_NONCE 14 /* RFC 3971 */ #define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ #define ND_OPT_RDNSS 25 /* RFC 6106 */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ #define ND_OPT_PREF64 38 /* RFC 8781 */ #define ND_OPT_MAX 38 struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; u_int8_t nd_opt_pi_len; u_int8_t nd_opt_pi_prefix_len; u_int8_t nd_opt_pi_flags_reserved; u_int32_t nd_opt_pi_valid_time; u_int32_t nd_opt_pi_preferred_time; u_int32_t nd_opt_pi_reserved2; struct in6_addr nd_opt_pi_prefix; } __packed; #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 struct nd_opt_rd_hdr { /* redirected header */ u_int8_t nd_opt_rh_type; u_int8_t nd_opt_rh_len; u_int16_t nd_opt_rh_reserved1; u_int32_t nd_opt_rh_reserved2; /* followed by IP header and data */ } __packed; struct nd_opt_mtu { /* MTU option */ u_int8_t nd_opt_mtu_type; u_int8_t nd_opt_mtu_len; u_int16_t nd_opt_mtu_reserved; u_int32_t nd_opt_mtu_mtu; } __packed; #define ND_OPT_NONCE_LEN ((1 * 8) - 2) #if ((ND_OPT_NONCE_LEN + 2) % 8) != 0 #error "(ND_OPT_NONCE_LEN + 2) must be a multiple of 8." 
#endif struct nd_opt_nonce { /* nonce option */ u_int8_t nd_opt_nonce_type; u_int8_t nd_opt_nonce_len; u_int8_t nd_opt_nonce[ND_OPT_NONCE_LEN]; } __packed; struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; u_int8_t nd_opt_rti_len; u_int8_t nd_opt_rti_prefixlen; u_int8_t nd_opt_rti_flags; u_int32_t nd_opt_rti_lifetime; /* prefix follows */ } __packed; struct nd_opt_rdnss { /* RDNSS option (RFC 6106) */ u_int8_t nd_opt_rdnss_type; u_int8_t nd_opt_rdnss_len; u_int16_t nd_opt_rdnss_reserved; u_int32_t nd_opt_rdnss_lifetime; /* followed by list of recursive DNS servers */ } __packed; struct nd_opt_dnssl { /* DNSSL option (RFC 6106) */ u_int8_t nd_opt_dnssl_type; u_int8_t nd_opt_dnssl_len; u_int16_t nd_opt_dnssl_reserved; u_int32_t nd_opt_dnssl_lifetime; /* followed by list of DNS search domains */ } __packed; struct nd_opt_pref64 { /* PREF64 option (RFC 8781) */ uint8_t nd_opt_pref64_type; uint8_t nd_opt_pref64_len; /* bits 0-12 are the SL, bits 13-15 are the PLC */ uint16_t nd_opt_pref64_sl_plc; char nd_opt_prefix[12]; } __packed; /* * icmp6 namelookup */ struct icmp6_namelookup { struct icmp6_hdr icmp6_nl_hdr; u_int8_t icmp6_nl_nonce[8]; int32_t icmp6_nl_ttl; #if 0 u_int8_t icmp6_nl_len; u_int8_t icmp6_nl_name[3]; #endif /* could be followed by options */ } __packed; /* * icmp6 node information */ struct icmp6_nodeinfo { struct icmp6_hdr icmp6_ni_hdr; u_int8_t icmp6_ni_nonce[8]; /* could be followed by reply data */ } __packed; #define ni_type icmp6_ni_hdr.icmp6_type #define ni_code icmp6_ni_hdr.icmp6_code #define ni_cksum icmp6_ni_hdr.icmp6_cksum #define ni_qtype icmp6_ni_hdr.icmp6_data16[0] #define ni_flags icmp6_ni_hdr.icmp6_data16[1] #define NI_QTYPE_NOOP 0 /* NOOP */ #define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */ #define NI_QTYPE_FQDN 2 /* FQDN (draft 04) */ #define NI_QTYPE_DNSNAME 2 /* DNS Name */ #define NI_QTYPE_NODEADDR 3 /* Node Addresses */ #define NI_QTYPE_IPV4ADDR 4 /* IPv4 Addresses */ #if BYTE_ORDER == BIG_ENDIAN #define NI_SUPTYPE_FLAG_COMPRESS 0x1 #define NI_FQDN_FLAG_VALIDTTL 0x1 #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_SUPTYPE_FLAG_COMPRESS 0x0100 #define NI_FQDN_FLAG_VALIDTTL 0x0100 #endif #ifdef NAME_LOOKUPS_04 #if BYTE_ORDER == BIG_ENDIAN #define NI_NODEADDR_FLAG_LINKLOCAL 0x1 #define NI_NODEADDR_FLAG_SITELOCAL 0x2 #define NI_NODEADDR_FLAG_GLOBAL 0x4 #define NI_NODEADDR_FLAG_ALL 0x8 #define NI_NODEADDR_FLAG_TRUNCATE 0x10 #define NI_NODEADDR_FLAG_ANYCAST 0x20 /* just experimental. not in spec */ #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_NODEADDR_FLAG_LINKLOCAL 0x0100 #define NI_NODEADDR_FLAG_SITELOCAL 0x0200 #define NI_NODEADDR_FLAG_GLOBAL 0x0400 #define NI_NODEADDR_FLAG_ALL 0x0800 #define NI_NODEADDR_FLAG_TRUNCATE 0x1000 #define NI_NODEADDR_FLAG_ANYCAST 0x2000 /* just experimental. not in spec */ #endif #else /* draft-ietf-ipngwg-icmp-name-lookups-05 (and later?) */ #if BYTE_ORDER == BIG_ENDIAN #define NI_NODEADDR_FLAG_TRUNCATE 0x1 #define NI_NODEADDR_FLAG_ALL 0x2 #define NI_NODEADDR_FLAG_COMPAT 0x4 #define NI_NODEADDR_FLAG_LINKLOCAL 0x8 #define NI_NODEADDR_FLAG_SITELOCAL 0x10 #define NI_NODEADDR_FLAG_GLOBAL 0x20 #define NI_NODEADDR_FLAG_ANYCAST 0x40 /* just experimental. not in spec */ #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_NODEADDR_FLAG_TRUNCATE 0x0100 #define NI_NODEADDR_FLAG_ALL 0x0200 #define NI_NODEADDR_FLAG_COMPAT 0x0400 #define NI_NODEADDR_FLAG_LINKLOCAL 0x0800 #define NI_NODEADDR_FLAG_SITELOCAL 0x1000 #define NI_NODEADDR_FLAG_GLOBAL 0x2000 #define NI_NODEADDR_FLAG_ANYCAST 0x4000 /* just experimental. 
not in spec */ #endif #endif struct ni_reply_fqdn { u_int32_t ni_fqdn_ttl; /* TTL */ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ } __packed; /* * Router Renumbering. as router-renum-08.txt */ struct icmp6_router_renum { /* router renumbering header */ struct icmp6_hdr rr_hdr; u_int8_t rr_segnum; u_int8_t rr_flags; u_int16_t rr_maxdelay; u_int32_t rr_reserved; } __packed; #define ICMP6_RR_FLAGS_TEST 0x80 #define ICMP6_RR_FLAGS_REQRESULT 0x40 #define ICMP6_RR_FLAGS_FORCEAPPLY 0x20 #define ICMP6_RR_FLAGS_SPECSITE 0x10 #define ICMP6_RR_FLAGS_PREVDONE 0x08 #define rr_type rr_hdr.icmp6_type #define rr_code rr_hdr.icmp6_code #define rr_cksum rr_hdr.icmp6_cksum #define rr_seqnum rr_hdr.icmp6_data32[0] struct rr_pco_match { /* match prefix part */ u_int8_t rpm_code; u_int8_t rpm_len; u_int8_t rpm_ordinal; u_int8_t rpm_matchlen; u_int8_t rpm_minlen; u_int8_t rpm_maxlen; u_int16_t rpm_reserved; struct in6_addr rpm_prefix; } __packed; #define RPM_PCO_ADD 1 #define RPM_PCO_CHANGE 2 #define RPM_PCO_SETGLOBAL 3 #define RPM_PCO_MAX 4 struct rr_pco_use { /* use prefix part */ u_int8_t rpu_uselen; u_int8_t rpu_keeplen; u_int8_t rpu_ramask; u_int8_t rpu_raflags; u_int32_t rpu_vltime; u_int32_t rpu_pltime; u_int32_t rpu_flags; struct in6_addr rpu_prefix; } __packed; #define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 #define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80000000 #define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40000000 #elif BYTE_ORDER == LITTLE_ENDIAN #define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80 #define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40 #endif struct rr_result { /* router renumbering result message */ u_int16_t rrr_flags; u_int8_t rrr_ordinal; u_int8_t rrr_matchedlen; u_int32_t rrr_ifid; struct in6_addr rrr_prefix; } __packed; #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 #elif BYTE_ORDER == LITTLE_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0200 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0100 #endif /* * icmp6 filter structures. */ struct icmp6_filter { u_int32_t icmp6_filt[8]; }; #ifdef _KERNEL #define ICMP6_FILTER_SETPASSALL(filterp) \ do { \ int i; u_char *p; \ p = (u_char *)filterp; \ for (i = 0; i < sizeof(struct icmp6_filter); i++) \ p[i] = 0xff; \ } while (/*CONSTCOND*/ 0) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ bzero(filterp, sizeof(struct icmp6_filter)) #else /* _KERNEL */ #define ICMP6_FILTER_SETPASSALL(filterp) \ memset(filterp, 0xff, sizeof(struct icmp6_filter)) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ memset(filterp, 0x00, sizeof(struct icmp6_filter)) #endif /* _KERNEL */ #define ICMP6_FILTER_SETPASS(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31))) #define ICMP6_FILTER_SETBLOCK(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31))) #define ICMP6_FILTER_WILLPASS(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0) #define ICMP6_FILTER_WILLBLOCK(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) /* * Variables related to this implementation * of the internet control message protocol version 6. 
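 *
 * (Aside: the icmp6_filter structure and ICMP6_FILTER_* macros above also
 * form the RFC 3542 userland API.  A hedged sketch that accepts only Echo
 * Replies on a raw ICMPv6 socket:
 *
 *      struct icmp6_filter flt;
 *      ICMP6_FILTER_SETBLOCKALL(&flt);
 *      ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &flt);
 *      setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, &flt, sizeof(flt));)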
*/ struct icmp6stat { /* statistics related to icmp6 packets generated */ uint64_t icp6s_error; /* # of calls to icmp6_error */ uint64_t icp6s_canterror; /* no error 'cuz old was icmp */ uint64_t icp6s_toofreq; /* no error 'cuz rate limitation */ uint64_t icp6s_outhist[256]; /* statistics related to input message processed */ uint64_t icp6s_badcode; /* icmp6_code out of range */ uint64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ uint64_t icp6s_checksum; /* bad checksum */ uint64_t icp6s_badlen; /* calculated bound mismatch */ uint64_t icp6s_dropped; /* # of packets dropped waiting for a resolution */ /* * number of responses: this member is inherited from netinet code, but * for netinet6 code, it is already available in icp6s_outhist[]. */ uint64_t icp6s_reflect; uint64_t icp6s_inhist[256]; uint64_t icp6s_nd_toomanyopt; /* too many ND options */ uint64_t icp6s_odst_unreach_noroute; uint64_t icp6s_odst_unreach_admin; uint64_t icp6s_odst_unreach_beyondscope; uint64_t icp6s_odst_unreach_addr; uint64_t icp6s_odst_unreach_noport; uint64_t icp6s_opacket_too_big; uint64_t icp6s_otime_exceed_transit; uint64_t icp6s_otime_exceed_reassembly; uint64_t icp6s_oparamprob_header; uint64_t icp6s_oparamprob_nextheader; uint64_t icp6s_oparamprob_option; uint64_t icp6s_oredirect; uint64_t icp6s_ounknown; uint64_t icp6s_pmtuchg; /* path MTU changes */ uint64_t icp6s_nd_badopt; /* bad ND options */ uint64_t icp6s_badns; /* bad neighbor solicitation */ uint64_t icp6s_badna; /* bad neighbor advertisement */ uint64_t icp6s_badrs; /* bad router solicitation */ uint64_t icp6s_badra; /* bad router advertisement */ uint64_t icp6s_badredirect; /* bad redirect message */ uint64_t icp6s_overflowdefrtr; /* Too many default routers. */ uint64_t icp6s_overflowprfx; /* Too many prefixes. */ uint64_t icp6s_overflownndp; /* Too many neighbour entries. */ uint64_t icp6s_overflowredirect;/* Too many redirects. */ uint64_t icp6s_invlhlim; /* Invalid hop limit. */ uint64_t icp6s_spare[32]; }; #ifdef _KERNEL #include #include #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define ICMP6STAT_ADD(name, val) \ do { \ MIB_SDT_PROBE1(icmp6, count, name, (val)); \ VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val)); \ } while (0) #define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1) #define ICMP6STAT_INC2(name, type) \ do { \ MIB_SDT_PROBE2(icmp6, count, name, 1, type); \ VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name[type], 1); \ } while (0) /* * Kernel module consumers must use this accessor macro. 
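 *
 * (A hedged usage sketch from a hypothetical module, bumping one of the
 * scalar counters declared in struct icmp6stat above:
 *
 *      KMOD_ICMP6STAT_INC(icp6s_badns);)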
 */
void    kmod_icmp6stat_inc(int statnum);
#define KMOD_ICMP6STAT_INC(name)                                        \
        do {                                                            \
                MIB_SDT_PROBE1(icmp6, count, name, 1);                  \
                kmod_icmp6stat_inc(                                     \
                    offsetof(struct icmp6stat, name) / sizeof(uint64_t)); \
        } while (0)
#endif

/*
 * Names for ICMP sysctl objects
 */
#define ICMPV6CTL_STATS         1
#define ICMPV6CTL_REDIRACCEPT   2       /* accept/process redirects */
#define ICMPV6CTL_REDIRTIMEOUT  3       /* redirect cache time */
#if 0   /*obsoleted*/
#define ICMPV6CTL_ERRRATELIMIT  5       /* ICMPv6 error rate limitation */
#endif
#define ICMPV6CTL_ND6_PRUNE     6
#define ICMPV6CTL_ND6_DELAY     8
#define ICMPV6CTL_ND6_UMAXTRIES 9
#define ICMPV6CTL_ND6_MMAXTRIES 10
#define ICMPV6CTL_ND6_USELOOPBACK       11
/*#define ICMPV6CTL_ND6_PROXYALL 12     obsoleted, do not reuse here */
#define ICMPV6CTL_NODEINFO      13
#define ICMPV6CTL_ERRPPSLIMIT   14      /* ICMPv6 error pps limitation */
#define ICMPV6CTL_ND6_MAXNUDHINT 15
#define ICMPV6CTL_MTUDISC_HIWAT 16
#define ICMPV6CTL_MTUDISC_LOWAT 17
#define ICMPV6CTL_ND6_DEBUG     18
#define ICMPV6CTL_ND6_DRLIST    19
#define ICMPV6CTL_ND6_PRLIST    20
#define ICMPV6CTL_MLD_MAXSRCFILTER      21
#define ICMPV6CTL_MLD_SOMAXSRC  22
#define ICMPV6CTL_MLD_VERSION   23
#define ICMPV6CTL_ND6_MAXQLEN   24
#define ICMPV6CTL_NODEINFO_OLDMCPREFIX  25
#define ICMPV6CTL_MAXID         26

#ifdef _KERNEL
# ifdef __STDC__
struct nhop_object;
struct rttimer;
struct in6_multi;
# endif
void    icmp6_paramerror(struct mbuf *, int);
int     icmp6_errmap(const struct icmp6_hdr *);
void    icmp6_error(struct mbuf *, int, int, int);
void    icmp6_error2(struct mbuf *, int, int, int, struct ifnet *);
int     icmp6_input(struct mbuf **, int *, int);
void    icmp6_prepare(struct mbuf *);
void    icmp6_redirect_input(struct mbuf *, int);
void    icmp6_redirect_output(struct mbuf *, struct nhop_object *);
int     icmp6_ratelimit(const struct in6_addr *, const int, const int);
-struct ip6ctlparam;
-void icmp6_mtudisc_update(struct ip6ctlparam *, int);
-
/* XXX: is this the right place for these macros?
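 * (They feed the per-interface counters hanging off if_afdata[AF_INET6];
 * a hedged input-path example would be:
 *
 *      icmp6_ifstat_inc(ifp, ifs6_in_error);
 *
 * where ifs6_in_error is a field of struct icmp6_ifstat.)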
 */
#define icmp6_ifstat_inc(ifp, tag)                                      \
        do {                                                            \
                if (ifp)                                                \
                        counter_u64_add(((struct in6_ifextra *)         \
                            ((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat[\
                            offsetof(struct icmp6_ifstat, tag) / sizeof(uint64_t)], 1);\
        } while (/*CONSTCOND*/ 0)

#define icmp6_ifoutstat_inc(ifp, type, code)                            \
        do {                                                            \
                icmp6_ifstat_inc(ifp, ifs6_out_msg);                    \
                if (type < ICMP6_INFOMSG_MASK)                          \
                        icmp6_ifstat_inc(ifp, ifs6_out_error);          \
                switch (type) {                                         \
                case ICMP6_DST_UNREACH:                                 \
                        icmp6_ifstat_inc(ifp, ifs6_out_dstunreach);     \
                        if (code == ICMP6_DST_UNREACH_ADMIN)            \
                                icmp6_ifstat_inc(ifp, ifs6_out_adminprohib); \
                        break;                                          \
                case ICMP6_PACKET_TOO_BIG:                              \
                        icmp6_ifstat_inc(ifp, ifs6_out_pkttoobig);      \
                        break;                                          \
                case ICMP6_TIME_EXCEEDED:                               \
                        icmp6_ifstat_inc(ifp, ifs6_out_timeexceed);     \
                        break;                                          \
                case ICMP6_PARAM_PROB:                                  \
                        icmp6_ifstat_inc(ifp, ifs6_out_paramprob);      \
                        break;                                          \
                case ICMP6_ECHO_REQUEST:                                \
                        icmp6_ifstat_inc(ifp, ifs6_out_echo);           \
                        break;                                          \
                case ICMP6_ECHO_REPLY:                                  \
                        icmp6_ifstat_inc(ifp, ifs6_out_echoreply);      \
                        break;                                          \
                case MLD_LISTENER_QUERY:                                \
                        icmp6_ifstat_inc(ifp, ifs6_out_mldquery);       \
                        break;                                          \
                case MLD_LISTENER_REPORT:                               \
                        icmp6_ifstat_inc(ifp, ifs6_out_mldreport);      \
                        break;                                          \
                case MLD_LISTENER_DONE:                                 \
                        icmp6_ifstat_inc(ifp, ifs6_out_mlddone);        \
                        break;                                          \
                case ND_ROUTER_SOLICIT:                                 \
                        icmp6_ifstat_inc(ifp, ifs6_out_routersolicit);  \
                        break;                                          \
                case ND_ROUTER_ADVERT:                                  \
                        icmp6_ifstat_inc(ifp, ifs6_out_routeradvert);   \
                        break;                                          \
                case ND_NEIGHBOR_SOLICIT:                               \
                        icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);\
                        break;                                          \
                case ND_NEIGHBOR_ADVERT:                                \
                        icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); \
                        break;                                          \
                case ND_REDIRECT:                                       \
                        icmp6_ifstat_inc(ifp, ifs6_out_redirect);       \
                        break;                                          \
                }                                                       \
        } while (/*CONSTCOND*/ 0)

#define ICMP6_NODEINFO_FQDNOK           0x1
#define ICMP6_NODEINFO_NODEADDROK       0x2
#define ICMP6_NODEINFO_TMPADDROK        0x4
#define ICMP6_NODEINFO_GLOBALOK         0x8

#endif /* _KERNEL */

#endif /* not _NETINET_ICMP6_H_ */

diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 1fce7c591639..2e039ebbfdd2 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,5027 +1,5027 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 static ip6proto_ctlinput_t tcp6_ctlinput; static udp_tun_icmp_t tcp6_ctlinput_viaudp; #endif VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif VNET_DEFINE(uint32_t, tcp_ack_war_time_window) = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_time_window), 0, "Time interval in ms used to limit the number (ack_war_cnt) of challenge ACKs sent per TCP connection"); VNET_DEFINE(uint32_t, tcp_ack_war_cnt) = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_cnt), 0, "Maximum number of challenge ACKs sent per TCP connection during the time interval (ack_war_timewindow)"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. 
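 * (For scale: a 64 kB socket buffer drains in roughly 45 segments at a
 * typical 1460-byte MSS, but in roughly 3277 segments at an MSS of 20.)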
Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... */ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static counter_u64_t tcp_pacing_failures; static counter_u64_t tcp_dgp_failures; static uint32_t shadow_tcp_pacing_dgp = 0; static volatile uint32_t number_of_dgp_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); static int tcp_dgp_limit = -1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW, &tcp_dgp_limit, -1, "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD, &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD, &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); /* * Target size of TCP PCB hash tables. Must be a power of two. 
* * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif static int tcp_tcbhashsize = TCBHASHSIZE; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp, void **ptr); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr); static void tcp_default_switch_failed(struct tcpcb *tp); static ipproto_ctlinput_t tcp_ctlinput; static udp_tun_icmp_t tcp_ctlinput_viaudp; static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = 
tcp_default_fb_fini, .tfb_switch_failed = tcp_default_switch_failed, .tfb_flags = TCP_FUNC_DEFAULT_OK, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; VNET_DEFINE_STATIC(struct tcp_function_block *, tcp_func_set_ptr) = &tcp_def_funcblk; #define V_tcp_func_set_ptr VNET(tcp_func_set_ptr) void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk = NULL; rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return (blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk = NULL; struct tcp_function *f; rw_assert(&tcp_function_lock, RA_LOCKED); TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } /* Find a matching alias for the given tcp_function_block. */ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = V_tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; void *ptr = NULL; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. 
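 * If that stack refuses the handoff (tfb_tcp_handoff_ok) or its init
 * function fails, we fall back to the built-in default stack below,
 * which is required to accept any connection.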
*/ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? */ if (tfb != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ if (tfb->tfb_tcp_fb_init == NULL || (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) { /* Release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* Now set in all the pointers */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; return; } /* * Initialization failed. Release the reference count on * the looked up default stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. */ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if ((*tfb->tfb_tcp_handoff_ok)(tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } if (tfb->tfb_tcp_fb_init != NULL && (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } /* Now release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* And set in the pointers to the new */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). 
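 * (By this point the bcopy() above has already slid the TCP header over
 * the UDP header, so all that remains is to shrink ip_len or ip6_plen
 * by sizeof(struct udphdr) and re-enter the TCP input path.)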
*/ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error = ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(V_tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return (error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } if ((blk->tfb_flags & TCP_FUNC_DEFAULT_OK) == 0) { error = EINVAL; goto done; } V_tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == V_tcp_func_set_ptr) ? '*' : ' ', alias ? 
f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static struct sx tcpoudp_lock; static void tcp_over_udp_stop(void) { sx_assert(&tcpoudp_lock, SA_XLOCKED); #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif sx_assert(&tcpoudp_lock, SA_XLOCKED); port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. 
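 * (This is the IPv6 side; the IPv4 socket was bound the same way above.
 * The port itself is net.inet.tcp.udp_tunneling_port, validated by the
 * sysctl handler below.)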
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { sx_xlock(&tcpoudp_lock); V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); if (error != 0) { V_tcp_udp_tunneling_port = 0; } } sx_xunlock(&tcpoudp_lock); } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. */ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. 
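 * (When the caller passed a NULL oldptr, the SYSCTL_OUT(req, NULL, len)
 * call after the loop reports the required buffer size instead.)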
*/ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. * * The init() fuction from the default can *never* return non-zero i.e. * it is required to always succeed since it is the stack of last resort! */ static int tcp_default_fb_init(struct tcpcb *tp, void **ptr) { struct socket *so = tptosocket(tp); int rexmt; INP_WLOCK_ASSERT(tptoinpcb(tp)); /* We don't use the pointer */ *ptr = NULL; /* Make sure we get no interesting mbuf queuing behavior */ /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); /* Cancel the GP measurement in progress */ tp->t_flags &= ~TF_GPUTINPROG; /* Validate the timers are not in usec, if they are convert */ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; if (tp->t_rxtshift == 0) tp->t_rxtcur = rexmt; else TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't * know what to do for unexpected states (which includes TIME_WAIT). */ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) return (0); /* * Make sure some kind of transmission timer is set if there is * outstanding data. */ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || tcp_timer_active(tp, TT_PERSIST))) { /* * If the session has established and it looks like it should * be in the persist state, set the persist timer. Otherwise, * set the retransmit timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && (int32_t)(tp->snd_nxt - tp->snd_una) < (int32_t)sbavail(&so->so_snd)) tcp_setpersist(tp); else tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } /* All non-embryonic sessions get a keepalive timer. */ if (!tcp_timer_active(tp, TT_KEEP)) tcp_timer_activate(tp, TT_KEEP, TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); /* * Make sure critical variables are initialized * if transitioning while in Recovery. */ if IN_FASTRECOVERY(tp->t_flags) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } return (0); } /* * tfb_tcp_fb_fini() function for the default stack. * * This changes state as necessary (or prudent) to prepare for another stack * to assume responsibility for the connection. 
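 * In particular, tcp_switch_back_to_default() only calls this once the
 * replacement stack's tfb_tcp_fb_init has succeeded, so a connection is
 * never left without a functioning stack.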
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif tp->t_acktime = 0; return; } MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *f[TCP_FUNCTION_NAME_NUM_MAX]; struct tcp_function_set fs; int error, i, num_registered; KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(*num_names > 0, ("%s: Called with non-positive length of name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if (*num_names > TCP_FUNCTION_NAME_NUM_MAX) { /* Too many names. */ *num_names = 0; return (E2BIG); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (blk->tfb_tcp_handoff_ok == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* These functions are required and a name is needed. */ *num_names = 0; return (EINVAL); } for (i = 0; i < *num_names; i++) { f[i] = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (f[i] == NULL) { while (--i >= 0) free(f[i], M_TCPFUNCTIONS); *num_names = 0; return (ENOMEM); } } num_registered = 0; rw_wlock(&tcp_function_lock); if (find_tcp_fb_locked(blk, NULL) != NULL) { /* A TCP function block can only be registered once. */ error = EALREADY; goto cleanup; } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { error = EINVAL; goto cleanup; } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ error = EALREADY; goto cleanup; } f[i]->tf_fb = blk; (void)strlcpy(f[i]->tf_name, names[i], sizeof(f[i]->tf_name)); TAILQ_INSERT_TAIL(&t_functions, f[i], tf_next); tcp_fb_cnt++; num_registered++; } rw_wunlock(&tcp_function_lock); return (0); cleanup: /* Remove the entries just added. 
*/ for (i = 0; i < *num_names; i++) { if (i < num_registered) { TAILQ_REMOVE(&t_functions, f[i], tf_next); tcp_fb_cnt--; } f[i]->tf_fb = NULL; free(f[i], M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); *num_names = num_registered; return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed), or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; VNET_ITERATOR_DECL(vnet_iter); if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (blk == V_tcp_func_set_ptr) { /* You can't free the current default in some vnet. */ CURVNET_RESTORE(); VNET_LIST_RUNLOCK_NOSLEEP(); rw_wunlock(&tcp_function_lock); return (EBUSY); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. */ if (force && blk->tfb_refcnt) { struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); while ((inp = inp_next(&inpi)) != NULL) { tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. 
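 *
 * A stack module's modevent handler would conventionally map
 * MOD_QUIESCE to deregister_tcp_functions(blk, true, false) and
 * MOD_UNLOAD to deregister_tcp_functions(blk, false, true); this
 * pairing is a convention of the callers, not something enforced here.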
*/ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void tcp_drain(void *ctx __unused, int flags __unused) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; V_tcp_msl_local = TCPTV_MSL_LOCAL; arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); static void tcp_init(void *arg __unused) { int hashsize; tcp_reass_global_init(); /* XXX virtualize those below? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_max = TCPTV_REXMTMAX; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); sx_init(&tcpoudp_lock, "TCP over UDP configuration"); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. 
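 * (Global state only; per-connection log buffers are created later,
 * in tcp_newtcpcb(), via tcp_log_tcpcbinit().)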
*/ tcp_log_init(); #endif if (tcp_soreceive_stream) { #ifdef INET tcp_protosw.pr_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_protosw.pr_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); #else /* INET6 */ max_protohdr_grow(sizeof(struct tcpiphdr)); #endif /* INET6 */ ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); tcp_extra_mbuf = counter_u64_alloc(M_WAITOK); tcp_would_have_but = counter_u64_alloc(M_WAITOK); tcp_comp_total = counter_u64_alloc(M_WAITOK); tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); tcp_dgp_failures = counter_u64_alloc(M_WAITOK); hashsize = tcp_tcbhashsize; if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, "net.inet.tcp.tcbhashsize", hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } tcp_tcbhashsize = hashsize; #ifdef INET IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput); #endif #ifdef INET6 IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput); #endif } SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { #ifdef TCP_HHOOK int error; #endif tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. 
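 * By this point in_pcbinfo_destroy() has already required every tcpcb
 * in this vnet to be gone, so the remaining per-vnet zones can be torn
 * down safely.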
*/ tcp_fastopen_destroy(); COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_off = 5; tcp_set_flags(th, 0); th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
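 *
 * For example, the keepalive path builds a template with
 * tcpip_maketemplate() and then, roughly, does:
 *
 *	t = tcpip_maketemplate(inp);
 *	if (t != NULL) {
 *		tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
 *		    tp->rcv_nxt, tp->snd_una - 1, 0);
 *		free(t, M_TEMP);
 *	}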
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; int ect = 0; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tptoinpcb(tp); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. 
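 * This branch is only reached when the mbuf is writable and no UDP
 * encapsulation header needs to be inserted, so the packet can be
 * rewritten in place: drop any chained mbufs, swap the addresses and
 * ports, and recompute where the TCP header starts.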
*/ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* clear any receive flags for proper bpf timestamping */ m->m_flags &= ~(M_TSTMP | M_TSTMP_LRO); /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { ect = tcp_ecn_output_established(tp, &flags, 0, false); /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = htonl(ect << IPV6_FLOWLABEL_LEN); ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_len = htons(tlen); if (inp != NULL) { ip->ip_tos = inp->inp_ip_tos & ~IPTOS_ECN_MASK; ip->ip_ttl = inp->inp_ip_ttl; } else { ip->ip_tos = 0; ip->ip_ttl = V_ip_defttl; } ip->ip_tos |= ect; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. 
*/ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp && (flags & TH_RST)) { /* Log the reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); } if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(inp, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && tcp_bblogging_on(tp)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. * * The following code ensures that the above description * is and stays correct. */ KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && (tp->t_state == TCPS_CLOSED || (tp->t_state > TCPS_LISTEN && tp->t_port != port)), ("%s: Logging of TCP segment with flags 0x%b and " "UDP encapsulation port %u skipped in state %s", __func__, thflags, PRINT_TH_FLAGS, ntohs(port), tcpstates[tp->t_state])); } } if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); #ifdef INET6 if (isipv6) { TCP_PROBE5(send, NULL, tp, ip6, tp, nth); output_ret = ip6_output(m, inp ? 
inp->in6p_outputopts : NULL, NULL, 0, NULL, NULL, inp); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { TCP_PROBE5(send, NULL, tp, ip, tp, nth); output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); } #endif if (lgb != NULL) lgb->tlb_errno = output_ret; } /* * Send a challenge ack (no data, no SACK option), but not more than * V_tcp_ack_war_cnt per V_tcp_ack_war_time_window (per TCP connection). */ void tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m) { sbintime_t now; bool send_challenge_ack; /* * The sending of a challenge ACK could be triggered by a blind attacker * to detect an existing TCP connection. To mitigate that, also increment * the global counter that would have been incremented had the attacker * guessed wrongly. */ (void)badport_bandlim(BANDLIM_TCP_RST); if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) { /* ACK war protection is disabled. */ send_challenge_ack = true; } else { /* Start new epoch, if the previous one is already over. */ now = getsbinuptime(); if (tp->t_challenge_ack_end < now) { tp->t_challenge_ack_cnt = 0; tp->t_challenge_ack_end = now + V_tcp_ack_war_time_window * SBT_1MS; } /* * Send a challenge ACK, if fewer than tcp_ack_war_cnt have been * sent in the current epoch. */ if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) { send_challenge_ack = true; tp->t_challenge_ack_cnt++; } else { send_challenge_ack = false; } } if (send_challenge_ack) { tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } /* * Create a new TCP control block, making an empty reassembly queue and hooking * it to the argument protocol control block. The `inp' parameter must have * come from the zone allocator set up by the tcpcbstor declaration. * The caller can provide a pointer to a tcpcb of the listener to inherit the * TCP function block from the listener. */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp, struct tcpcb *listening_tcb) { struct tcpcb *tp = intotcpcb(inp); #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ /* * Historically allocation was done with M_ZERO. There is a lot of * code that relies on that. For now take the safe approach and zero * the whole tcpcb. This definitely can be optimized. */ bzero(&tp->t_start_zero, t_zero_size); /* Initialise cc_var struct for this tcpcb.
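 * The back-pointer assigned below is how congestion control modules
 * recover the owning tcpcb from the cc_var handed to their entry
 * points.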
*/ tp->t_ccv.tp = tp; rw_rlock(&tcp_function_lock); if (listening_tcb != NULL) { INP_LOCK_ASSERT(tptoinpcb(listening_tcb)); KASSERT(listening_tcb->t_fb != NULL, ("tcp_newtcpcb: listening_tcb->t_fb is NULL")); if (listening_tcb->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) { rw_runlock(&tcp_function_lock); return (NULL); } tp->t_fb = listening_tcb->t_fb; } else { tp->t_fb = V_tcp_func_set_ptr; } refcount_acquire(&tp->t_fb->tfb_refcnt); KASSERT((tp->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) == 0, ("tcp_newtcpcb: using TFB being removed")); rw_runlock(&tcp_function_lock); CC_LIST_RLOCK(); if (listening_tcb != NULL) { if (CC_ALGO(listening_tcb)->flags & CC_MODULE_BEING_REMOVED) { CC_LIST_RUNLOCK(); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } CC_ALGO(tp) = CC_ALGO(listening_tcb); } else CC_ALGO(tp) = CC_DEFAULT_ALGO(); cc_refer(CC_ALGO(tp)); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #ifdef TCP_HHOOK if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #endif TAILQ_INIT(&tp->t_segq); STAILQ_INIT(&tp->t_inqueue); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); tp->t_hpts_cpu = HPTS_CPU_NONE; tp->t_lro_cpu = HPTS_CPU_NONE; callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_TRYLOCK | CALLOUT_RETURNUNLOCKED); for (int i = 0; i < TT_N; i++) tp->t_timers[i] = SBT_MAX; switch (V_tcp_do_rfc1323) { case 0: break; default: case 1: tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); break; case 2: tp->t_flags = TF_REQ_SCALE; break; case 3: tp->t_flags = TF_REQ_TSTMP; break; } if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* We always start with ticks granularity */ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. 
*/ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } } #ifdef STATS if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; return (tp); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); /* Don't use tcp_output() here due to possible recursion. */ (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); struct mbuf *m; #ifdef INET6 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(inp); MPASS(!callout_active(&tp->t_callout)); MPASS(TAILQ_EMPTY(&tp->snd_holes)); /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; /* Detach from the CC algorithm */ cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif #ifdef STATS stats_blob_destroy(tp->t_stats); #endif CC_ALGO(tp) = NULL; if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) { struct mbuf *prev; STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, prev) m_freem(m); } TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); MPASS(!tcp_in_hpts(tp)); #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! * * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.hc_ssthresh = ssthresh; metrics.hc_rtt = tp->t_srtt; metrics.hc_rttvar = tp->t_rttvar; metrics.hc_cwnd = tp->snd_cwnd; metrics.hc_sendpipe = 0; metrics.hc_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } refcount_release(&tp->t_fb->tfb_refcnt); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } tcp_timer_stop(tp); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); tcp_free_sackholes(tp); soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); sorele(so); return (NULL); } return (tp); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. 
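 * For example, EHOSTUNREACH against an ESTABLISHED connection merely
 * drops the cached route below, while the same class of error seen
 * repeatedly during connection setup escalates to tcp_drop().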
*/ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { INP_RUNLOCK(inp); break; } else continue; } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
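 * (A consumer would, hypothetically, compare this trailing xig_gen
 * with the one in the leading copy and re-issue the sysctl on
 * mismatch.)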
*/ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #define SND_TAG_STATUS_MAXLEN 128 #ifdef KERN_TLS static struct sx ktlslist_lock; SX_SYSINIT(ktlslistlock, &ktlslist_lock, "ktlslist"); static uint64_t ktls_glob_gen = 1; static int tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) { struct xinpgen xig; struct inpcb *inp; struct socket *so; struct ktls_session *ksr, *kss; char *buf; struct xktls_session *xktls; uint64_t ipi_gencnt; size_t buflen, len, sz; u_int cnt; int error; bool ek, p; sx_assert(&ktlslist_lock, SA_XLOCKED); if (req->newptr != NULL) return (EPERM); len = 0; cnt = 0; ipi_gencnt = V_tcbinfo.ipi_gencnt; bzero(&xig, sizeof(xig)); xig.xig_len = sizeof(xig); xig.xig_gen = ktls_glob_gen++; xig.xig_sogen = so_gencnt; struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt > ipi_gencnt || cr_canseeinpcb(req->td->td_ucred, inp) != 0) continue; so = inp->inp_socket; if (so != NULL && so->so_gencnt <= xig.xig_sogen) { p = false; ek = export_keys && cr_canexport_ktlskeys( req->td, inp); ksr = so->so_rcv.sb_tls_info; if (ksr != NULL) { ksr->gen = xig.xig_gen; p = true; if (ek) { sz = SIZE_T_MAX; ktls_session_copy_keys(ksr, NULL, &sz); len += sz; } if (ksr->snd_tag != NULL && ksr->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; in_pcbref(inp); INP_RUNLOCK(inp); error = ksr->snd_tag->sw-> snd_tag_status_str( ksr->snd_tag, NULL, &sz); if (in_pcbrele_rlock(inp)) return (EDEADLK); if (error == 0) len += sz; } } kss = so->so_snd.sb_tls_info; if (kss != NULL) { kss->gen = xig.xig_gen; p = true; if (ek) { sz = SIZE_T_MAX; ktls_session_copy_keys(kss, NULL, &sz); len += sz; } if (kss->snd_tag != NULL && kss->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; in_pcbref(inp); INP_RUNLOCK(inp); error = kss->snd_tag->sw-> snd_tag_status_str( kss->snd_tag, NULL, &sz); if (in_pcbrele_rlock(inp)) return (EDEADLK); if (error == 0) len += sz; } } if (p) { len += sizeof(*xktls); len = roundup2(len, __alignof(struct xktls_session)); } } } if (req->oldptr == NULL) { len += 2 * sizeof(xig); len += 3 * len / 4; req->oldidx = len; return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); error = SYSCTL_OUT(req, &xig, sizeof xig); if (error != 0) return (error); buflen = roundup2(sizeof(*xktls) + 2 * TLS_MAX_PARAM_SIZE + 2 * SND_TAG_STATUS_MAXLEN, __alignof(struct xktls_session)); buf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); struct inpcb_iterator inpi1 = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); while ((inp = inp_next(&inpi1)) != NULL) { if (inp->inp_gencnt > ipi_gencnt || cr_canseeinpcb(req->td->td_ucred, inp) != 0) continue; so = inp->inp_socket; if (so == NULL) continue; p = false; ek = export_keys && cr_canexport_ktlskeys(req->td, inp); ksr = so->so_rcv.sb_tls_info; kss = so->so_snd.sb_tls_info; xktls = (struct xktls_session *)buf; if (ksr != NULL && ksr->gen == xig.xig_gen) { p = true; ktls_session_to_xktls_onedir(ksr, ek, &xktls->rcv); } if (kss != NULL && kss->gen == xig.xig_gen) { p = true; ktls_session_to_xktls_onedir(kss, ek, &xktls->snd); } if (!p) continue; xktls->inp_gencnt = inp->inp_gencnt; 
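		/*
		 * Stamp the export record with the PCB identity before the
		 * variable-length key material and driver status strings are
		 * appended below.
		 */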
xktls->so_pcb = (kvaddr_t)inp; memcpy(&xktls->coninf, &inp->inp_inc, sizeof(xktls->coninf)); len = sizeof(*xktls); if (ksr != NULL && ksr->gen == xig.xig_gen) { if (ek) { sz = buflen - len; ktls_session_copy_keys(ksr, buf + len, &sz); len += sz; } else { xktls->rcv.cipher_key_len = 0; xktls->rcv.auth_key_len = 0; } if (ksr->snd_tag != NULL && ksr->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; in_pcbref(inp); INP_RUNLOCK(inp); error = ksr->snd_tag->sw->snd_tag_status_str( ksr->snd_tag, buf + len, &sz); if (in_pcbrele_rlock(inp)) return (EDEADLK); if (error == 0) { xktls->rcv.drv_st_len = sz; len += sz; } } } if (kss != NULL && kss->gen == xig.xig_gen) { if (ek) { sz = buflen - len; ktls_session_copy_keys(kss, buf + len, &sz); len += sz; } else { xktls->snd.cipher_key_len = 0; xktls->snd.auth_key_len = 0; } if (kss->snd_tag != NULL && kss->snd_tag->sw->snd_tag_status_str != NULL) { sz = SND_TAG_STATUS_MAXLEN; in_pcbref(inp); INP_RUNLOCK(inp); error = kss->snd_tag->sw->snd_tag_status_str( kss->snd_tag, buf + len, &sz); if (in_pcbrele_rlock(inp)) return (EDEADLK); if (error == 0) { xktls->snd.drv_st_len = sz; len += sz; } } } len = roundup2(len, __alignof(*xktls)); xktls->tsz = len; xktls->fsz = sizeof(*xktls); error = SYSCTL_OUT(req, xktls, len); if (error != 0) { INP_RUNLOCK(inp); break; } cnt++; } if (error == 0) { xig.xig_sogen = so_gencnt; xig.xig_count = cnt; error = SYSCTL_OUT(req, &xig, sizeof(xig)); } zfree(buf, M_TEMP); return (error); } static int tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys) { int repeats, error; for (repeats = 0; repeats < 100; repeats++) { if (sx_xlock_sig(&ktlslist_lock)) return (EINTR); error = tcp_ktlslist_locked(oidp, arg1, arg2, req, export_keys); sx_xunlock(&ktlslist_lock); if (error != EDEADLK) break; if (sig_intr() != 0) { error = EINTR; break; } req->oldidx = 0; } return (error); } static int tcp_ktlslist_nokeys(SYSCTL_HANDLER_ARGS) { return (tcp_ktlslist1(oidp, arg1, arg2, req, false)); } static int tcp_ktlslist_wkeys(SYSCTL_HANDLER_ARGS) { return (tcp_ktlslist1(oidp, arg1, arg2, req, true)); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST, ktlslist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, tcp_ktlslist_nokeys, "S,xktls_session", "List of active kTLS sessions for TCP connections"); SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST_WKEYS, ktlslist_wkeys, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, tcp_ktlslist_wkeys, "S,xktls_session", "List of active kTLS sessions for TCP connections with keys"); #endif /* KERN_TLS */ #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; if (req->newptr == NULL) return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int 
tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif if (req->newptr == NULL) return (EINVAL); error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. */ static inline int tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) { int mtu = ntohs(icp->icmp_nextmtu); /* If no alternative MTU was proposed, try the next smaller one. */ if (!mtu) mtu = ip_next_mtu(ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); return (mtu); } static void tcp_ctlinput_with_port(struct icmp *icp, uint16_t port) { struct ip *ip; struct tcphdr *th; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int); struct in_conninfo inc; tcp_seq icmp_tcp_seq; int errno, mtu; errno = icmp_errmap(icp); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } ip = &icp->icmp_ip; th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); icmp_tcp_seq = th->th_seq; inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* * MTU discovery for offloaded connections. Let * the TOE driver verify seq# and process it. */ mtu = tcp_next_pmtu(icp, ip); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: we got a needfrag and * will potentially try a lower MTU. */ mtu = tcp_next_pmtu(icp, ip); /* * Only process the offered MTU if it * is smaller than the current one. 
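 * For instance, with t_maxseg == 1460 the packets being sent are
 * 1460 + sizeof(struct tcpiphdr) == 1500 bytes, so an offered MTU of
 * 1400 is acted on while an offer of 1500 or more is ignored.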
*/ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = ip->ip_dst; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); inp = tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = ip->ip_dst; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp_ctlinput(struct icmp *icmp) { tcp_ctlinput_with_port(icmp, htons(0)); } static void tcp_ctlinput_viaudp(udp_tun_icmp_param_t param) { /* It's a tunneled TCP-over-UDP ICMP. */ struct icmp *icmp = param.icmp; struct ip *outer_ip, *inner_ip; struct udphdr *udp; struct tcphdr *th, ttemp; int i_hlen, o_len; uint16_t port; outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); inner_ip = &icmp->icmp_ip; i_hlen = inner_ip->ip_hl << 2; o_len = ntohs(outer_ip->ip_len); if (o_len < (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { /* Not enough data present */ return; } /* OK, strip out the inner UDP header by copying the TCP header up on top of it. */ udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; th = (struct tcphdr *)(udp + 1); memcpy(&ttemp, th, sizeof(struct tcphdr)); memcpy(udp, &ttemp, sizeof(struct tcphdr)); /* Now adjust down the size of the outer IP header */ o_len -= sizeof(struct udphdr); outer_ip->ip_len = htons(o_len); /* Now call in to the normal handling code */ tcp_ctlinput_with_port(icmp, port); } #endif /* INET */ #ifdef INET6 static inline int tcp6_next_pmtu(const struct icmp6_hdr *icmp6) { int mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was proposed, or the proposed MTU was too * small, set to the min. */ if (mtu < IPV6_MMTU) - mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for?
*/ + mtu = IPV6_MMTU; return (mtu); } static void tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int); struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; int errno; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; dst = &ip6cp->ip6c_finaldst->sin6_addr; errno = icmp6_errmap(icmp6); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: /* * There are only four ICMPs that may reset connection: * - administratively prohibited * - port unreachable * - time exceeded in transit * - unknown next header */ if (V_icmp_may_rst && ((icmp6->icmp6_type == ICMP6_DST_UNREACH && (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || (icmp6->icmp6_type == ICMP6_PARAM_PROB && icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. 
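 * The check mirrors the IPv4 one above, but with 60 bytes of header
 * overhead: a 40-byte IPv6 header plus a 20-byte TCP header.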
*/ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp6_ctlinput(struct ip6ctlparam *ctl) { tcp6_ctlinput_with_port(ctl, htons(0)); } static void tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) { struct ip6ctlparam *ip6cp = param.ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now call in to the normal handling code */ tcp6_ctlinput_with_port(ip6cp, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. 
* Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ static struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (tp->t_flags & TF_FASTOPEN) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCK_SENDBUF_LOCK(so); /* If the mss is larger than the socket buffer, decrease the mss. 
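 * Clamping t_maxseg to sb_hiwat keeps a single segment from exceeding
 * the send buffer; if the clamp pushes the MSS below V_tcp_mssdflt we
 * additionally stop processing incoming SACKs, since segments that
 * small would make SACK bookkeeping an easy attack vector.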
*/ if (so->so_snd.sb_hiwat < tp->t_maxseg) { tp->t_maxseg = so->so_snd.sb_hiwat; if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACK's since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } } SOCK_SENDBUF_UNLOCK(so); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; /* XXXKIB IFCAP2_IPSEC_OFFLOAD_TSO */ cap->ipsec_tso = (ifp->if_capenable2 & IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) != 0; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; cap->ipsec_tso = false; /* XXXKIB */ } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. */ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); /* * In case of the IPV6_USE_MIN_MTU socket * option, the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. 
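 * A sketch of the arithmetic, assuming the standard IPv6 constants:
 * IPV6_MMTU (1280) - sizeof(struct ip6_hdr) (40) -
 * sizeof(struct tcphdr) (20) = TCP6_MSS (1220), which is the
 * ceiling the code below enforces once the handshake has started.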
*/ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) { tp->t_maxseg = TCP6_MSS; if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACKs since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } } } } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK, etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have simplified code from tcp_addoptions(), * without a proper loop, and with most paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since the result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have simplified code from tcp_addoptions(), * without a proper loop, and with most paddings hardcoded. * We only consider fixed options that we would send every * time, i.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one.
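 * A userland caller (tcpdrop(8) works this way) writes the two
 * sockaddrs back to back, e.g.:
 *
 *	struct sockaddr_storage addrs[2];	(foreign, then local)
 *	sysctlbyname("net.inet.tcp.drop", NULL, NULL,
 *	    &addrs, sizeof(addrs));
 *
 * Both entries must share one address family; v4-mapped IPv6
 * addresses are converted to plain AF_INET before the pcb lookup.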
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (!SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); static int tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) { return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, &tcp_ctloutput_set)); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", "Set socket option for TCP endpoint"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
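 * The input layout matches sysctl_drop() above; only the OID
 * differs.  Writing the pair to net.inet.tcp.switch_to_sw_tls
 * (arg2 == 0) or net.inet.tcp.switch_to_ifnet_tls (arg2 == 1)
 * selects the mode handed to ktls_set_tx_mode() below.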
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); sorele(so); } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; #ifdef INET const struct ip *ip = (const struct ip *)ip4hdr; #endif #ifdef INET6 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
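 * For instance, netstat(1) and sockstat(1) read an array of these
 * via the net.inet.tcp.pcblist sysctl; since each record carries
 * its own length in xt_len, userland can detect when the exported
 * layout grows.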
*/ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); sbintime_t now; bzero(xt, sizeof(*xt)); xt->t_state = tp->t_state; xt->t_logstate = tcp_get_bblog_state(tp); xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_dsack_bytes = tp->t_dsack_bytes; xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; xt->t_dsack_pack = tp->t_dsack_pack; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; now = getsbinuptime(); #define COPYTIMER(which,where) do { \ if (tp->t_timers[which] != SBT_MAX) \ xt->where = (tp->t_timers[which] - now) / SBT_1MS; \ else \ xt->where = 0; \ } while (0) COPYTIMER(TT_DELACK, tt_delack); COPYTIMER(TT_REXMT, tt_rexmt); COPYTIMER(TT_PERSIST, tt_persist); COPYTIMER(TT_KEEP, tt_keep); COPYTIMER(TT_2MSL, tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? */ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { counter_u64_add(tcp_pacing_failures, 1); return (0); } } int tcp_incr_dgp_pacing_cnt(void) { if ((tcp_dgp_limit == -1) || (tcp_dgp_limit > number_of_dgp_connections)) { atomic_fetchadd_int(&number_of_dgp_connections, 1); shadow_tcp_pacing_dgp = number_of_dgp_connections; return (1); } else { counter_u64_add(tcp_dgp_failures, 1); return (0); } } static uint8_t tcp_dgp_warning = 0; void tcp_dec_dgp_pacing_cnt(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_dgp_connections, -1); shadow_tcp_pacing_dgp = number_of_dgp_connections; KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?")); if (ret == 0) { if (tcp_dgp_limit != -1) { printf("Warning all DGP is now disabled, count decrements invalidly!\n"); tcp_dgp_limit = 0; tcp_dgp_warning = 1; } else if (tcp_dgp_warning == 0) { printf("Warning DGP pacing is invalid, invalid decrement\n"); tcp_dgp_warning = 1; } } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 
0; } else if (tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } } static void tcp_default_switch_failed(struct tcpcb *tp) { /* * If a switch fails we only need to * care about two things: * a) The t_flags2 * and * b) The timer granularity. * Timeouts, at least for now, don't use the * old callout system in the other stacks so * those are hopefully safe. */ tcp_lro_features_off(tp); tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); } #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss) { if (SEQ_LT(th->th_ack, tp->snd_una)) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_BEHIND]++; } return (ACK_BEHIND); } } else if (th->th_ack == tp->snd_una) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else if (tiwin != tp->snd_wnd) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_RWND]++; } return (ACK_RWND); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_DUPACK]++; } return (ACK_DUPACK); } } else { if (!SEQ_GT(th->th_ack, tp->snd_max)) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((th->th_ack - tp->snd_una) + mss - 1)/mss); } } if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK_SACK]++; } return (ACK_CUMACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK]++; } return (ACK_CUMACK); } } } #endif void tcp_change_time_units(struct tcpcb *tp, int granularity) { if (tp->t_tmr_granularity == granularity) { /* We are there */ return; } if (granularity == TCP_TMR_GRANULARITY_USEC) { KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS), ("Granularity is not TICKS its %u in tp:%p", tp->t_tmr_granularity, tp)); tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); if (tp->t_srtt > 1) { uint32_t val, frac; val = tp->t_srtt >> TCP_RTT_SHIFT; frac = tp->t_srtt & 0x1f; tp->t_srtt = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any) * but it's in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; frac = tp->t_rttvar & 0x1f; tp->t_rttvar = TICKS_2_USEC(val); /* * frac is the fractional part of the rttvar (if any) * but it's in ticks and every bit represents * 1/32nd of a hz.
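 * Concretely, taking hz == 1000 as an example: one fractional bit
 * is 1/32 of a tick, i.e. 1000 / 32 == 31.25 usec, so frac == 16
 * contributes 16 * 1000 / 32 == 500 usec in the conversion below.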
*/ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_rttvar += frac; } tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC; } else if (granularity == TCP_TMR_GRANULARITY_TICKS) { /* Convert back to ticks. */ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC), ("Granularity is not USEC its %u in tp:%p", tp->t_tmr_granularity, tp)); if (tp->t_srtt > 1) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_srtt); frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); tp->t_srtt = val << TCP_RTT_SHIFT; /* * frac is the fractional part that is left * over from converting to hz and shifting. * We need to convert this to the 5 bit * remainder. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_rttvar); frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz); tp->t_rttvar = val << TCP_RTTVAR_SHIFT; /* * frac is the fractional part that is left * over from converting to hz and shifting. * We need to convert this to the 4 bit * remainder. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTTVAR_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTTVAR_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_rttvar += frac; } } tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; } #ifdef INVARIANTS else { panic("Unknown granularity:%d tp:%p", granularity, tp); } #endif } void tcp_handle_orphaned_packets(struct tcpcb *tp) { struct mbuf *save, *m, *prev; /* * Called when a stack switch is occurring from the fini() * of the old stack. We assume the init() of the new stack * has already been run and that it has set the t_flags2 to * what it supports. This function will then deal with any * differences, i.e. clean up packets that may be queued * which the new stack does not support. */ if (tp->t_flags2 & TF2_MBUF_L_ACKS) return; if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) == 0 && !STAILQ_EMPTY(&tp->t_inqueue)) { /* * It is unsafe to process the packets since a * reset may be lurking in them (it's rare but it * can occur). If we were to find a RST, then we * would end up dropping the connection and the * INP lock, so when we return the caller (tcp_usrreq) * will blow up when it tries to unlock the inp. * This new stack does not do any fancy LRO features * so all we can do is toss the packets. */ m = STAILQ_FIRST(&tp->t_inqueue); STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) m_freem(m); } else { /* * Here we have a stack that does mbuf queuing but * does not support compressed ACKs. We must * walk all the mbufs and discard any compressed ACKs.
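 * (prev is maintained because the queue is singly linked: freeing
 * a middle element requires STAILQ_REMOVE_AFTER() on the node in
 * front of it, while the head case uses STAILQ_REMOVE_HEAD().)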
*/ STAILQ_FOREACH_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) { if (m->m_flags & M_ACKCMP) { if (m == STAILQ_FIRST(&tp->t_inqueue)) STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); else STAILQ_REMOVE_AFTER(&tp->t_inqueue, prev, m_stailqpkt); m_freem(m); } else prev = m; } } } #ifdef TCP_REQUEST_TRK uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) { #ifdef KERN_TLS struct ktls_session *tls; uint32_t rec_oh, records; tls = so->so_snd.sb_tls_info; if (tls == NULL) return (0); rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); return (records * rec_oh); #else return (0); #endif } extern uint32_t tcp_stale_entry_time; uint32_t tcp_stale_entry_time = 250000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, &tcp_stale_entry_time, 250000, "Time that a tcpreq entry without a sendfile ages out"); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) { if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = val; log.u_bbr.rttProp = req->timestamp; log.u_bbr.delRate = req->start; log.u_bbr.cur_del_rate = req->end; log.u_bbr.flex1 = req->start_seq; log.u_bbr.flex2 = req->end_seq; log.u_bbr.flex3 = req->flags; log.u_bbr.flex4 = ((req->localtime >> 32) & 0x00000000ffffffff); log.u_bbr.flex5 = (req->localtime & 0x00000000ffffffff); log.u_bbr.flex7 = slot; log.u_bbr.bw_inuse = offset; /* nbytes = flex6 | epoch */ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); /* cspr = lt_epoch | pkts_out */ log.u_bbr.lt_epoch = ((req->cspr >> 32) & 0x00000000ffffffff); log.u_bbr.pkts_out |= (req->cspr & 0x00000000ffffffff); log.u_bbr.applimited = tp->t_tcpreq_closed; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_open; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_req; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_REQ_T, 0, 0, &log, false, &tv); } } void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent) { if (tp->t_tcpreq_req > 0) tp->t_tcpreq_req--; if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { if (tp->t_tcpreq_open > 0) tp->t_tcpreq_open--; } else { if (tp->t_tcpreq_closed > 0) tp->t_tcpreq_closed--; } ent->flags = TCP_TRK_TRACK_FLG_EMPTY; } static void tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) { struct tcp_sendfile_track *ent; uint64_t time_delta, oldest_delta; int i, oldest, oldest_set = 0, cnt_rm = 0; for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags != TCP_TRK_TRACK_FLG_USED) { /* * We only care about closed end ranges * that are allocated and have no sendfile * ever touching them. They would be in * state USED. */ continue; } if (ts >= ent->localtime) time_delta = ts - ent->localtime; else time_delta = 0; if (time_delta && ((oldest_delta < time_delta) || (oldest_set == 0))) { oldest_set = 1; oldest = i; oldest_delta = time_delta; } if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) { /* * No sendfile within our time limit; * time to purge it.
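 * With the default tcp_stale_entry_time of 250000, and assuming
 * the stack supplies microsecond timestamps here, that is 250 ms:
 * the entry is logged as TCP_TRK_REQ_LOG_STALE and its slot
 * reclaimed.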
*/ cnt_rm++; tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE, time_delta, 0); tcp_req_free_a_slot(tp, ent); } } if ((cnt_rm == 0) && rm_oldest && oldest_set) { ent = &tp->t_tcpreq_info[oldest]; tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE, oldest_delta, 1); tcp_req_free_a_slot(tp, ent); } } int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) { int i, ret = 0; struct tcp_sendfile_track *ent; /* Clean up any old closed end requests that are now completed */ if (tp->t_tcpreq_req == 0) return (0); if (tp->t_tcpreq_closed == 0) return (0); for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; /* Skip empty ones */ if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; /* Skip open ones */ if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) continue; if (SEQ_GEQ(ack_point, ent->end_seq)) { /* We are past it -- free it */ tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_FREED, 0, 0); tcp_req_free_a_slot(tp, ent); ret++; } } return (ret); } int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point) { if (tp->t_tcpreq_req == 0) return (-1); if (tp->t_tcpreq_closed == 0) return (-1); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) return (-1); if (SEQ_GEQ(ack_point, ent->end_seq)) { return (1); } return (0); } struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip) { /* * Given an ack point (th_ack) walk through our entries and * return the first one found that th_ack goes past the * end_seq. */ struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0) { if (SEQ_GEQ(th_ack, ent->end_seq)) { *ip = i; return (ent); } } } return (NULL); } struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) { struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for (i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_SEARCH, (uint64_t)seq, 0); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) { continue; } if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { /* * An open end request only needs to * match the beginning seq or be * all we have (once we keep going on * a open end request we may have a seq * wrap). */ if ((SEQ_GEQ(seq, ent->start_seq)) || (tp->t_tcpreq_closed == 0)) return (ent); } else { /* * For this one we need to * be a bit more careful if its * completed at least. */ if ((SEQ_GEQ(seq, ent->start_seq)) && (SEQ_LT(seq, ent->end_seq))) { return (ent); } } } return (NULL); } /* Should this be in its own file tcp_req.c ? 
*/ struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups) { struct tcp_sendfile_track *fil; int i, allocated; /* In case the stack does not check for completions, do so now */ tcp_req_check_for_comp(tp, tp->snd_una); /* Check for stale entries */ if (tp->t_tcpreq_req) tcp_req_check_for_stale_entries(tp, ts, (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ)); /* Check to see if this is a duplicate of one not started */ if (tp->t_tcpreq_req) { for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0) continue; if ((fil->timestamp == req->timestamp) && (fil->start == req->start) && ((fil->flags & TCP_TRK_TRACK_FLG_OPEN) || (fil->end == req->end))) { /* * We already have this request * and it has not been started with sendfile. * This probably means the user was returned * a 4xx of some sort and it's going to age * out; let's not duplicate it. */ return (fil); } } } /* OK, if there is no room at the inn, we are in trouble */ if (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ) { tcp_trace_point(tp, TCP_TP_REQ_LOG_FAIL); for (i = 0; i < MAX_TCP_TRK_REQ; i++) { tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_ALLOCFAIL, 0, 0); } return (NULL); } for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if (fil->flags == TCP_TRK_TRACK_FLG_EMPTY) { allocated = 1; fil->flags = TCP_TRK_TRACK_FLG_USED; fil->timestamp = req->timestamp; fil->playout_ms = req->playout_ms; fil->localtime = ts; fil->start = req->start; if (req->flags & TCP_LOG_HTTPD_RANGE_END) { fil->end = req->end; } else { fil->end = 0; fil->flags |= TCP_TRK_TRACK_FLG_OPEN; } /* * We can set the min boundaries to the TCP sequence space, * but it might be found to be further up when sendfile * actually runs on this range (if it ever does). */ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; fil->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; if (req->flags & TCP_LOG_HTTPD_RANGE_END) fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); else fil->end_seq = 0; if (tptosocket(tp)->so_snd.sb_tls_info) { /* * This session is doing TLS. Take a swag guess * at the overhead. */ fil->end_seq += tcp_estimate_tls_overhead( tptosocket(tp), (fil->end - fil->start)); } tp->t_tcpreq_req++; if (fil->flags & TCP_TRK_TRACK_FLG_OPEN) tp->t_tcpreq_open++; else tp->t_tcpreq_closed++; tcp_req_log_req_info(tp, fil, i, TCP_TRK_REQ_LOG_NEW, 0, 0); break; } else fil = NULL; } return (fil); } void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) { (void)tcp_req_alloc_req_full(tp, &user->tcp_req, ts, 1); } #endif void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err) { if (tcp_bblogging_on(tp)) { struct tcp_log_buffer *l; l = tcp_log_event(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_SOCKET_OPT, err, 0, NULL, 1, NULL, NULL, 0, NULL); if (l) { l->tlb_flex1 = option_num; l->tlb_flex2 = option_val; } } } uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity) { uint32_t srtt; KASSERT(granularity == TCP_TMR_GRANULARITY_USEC || granularity == TCP_TMR_GRANULARITY_TICKS, ("%s: called with unexpected granularity %d", __func__, granularity)); srtt = tp->t_srtt; /* * We only support two granularities. If the stored granularity * does not match the granularity requested by the caller, * convert the stored value to the requested unit of granularity.
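 * Worked example, assuming hz == 1000: an srtt of 10 ticks is
 * stored shifted, t_srtt == 10 << TCP_RTT_SHIFT == 320.  A caller
 * asking for TCP_TMR_GRANULARITY_USEC gets
 * TICKS_2_USEC(320) >> TCP_RTT_SHIFT == 320000 >> 5 == 10000 usec,
 * i.e. the expected 10 ms.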
*/ if (tp->t_tmr_granularity != granularity) { if (granularity == TCP_TMR_GRANULARITY_USEC) srtt = TICKS_2_USEC(srtt); else srtt = USEC_2_TICKS(srtt); } /* * If the srtt is stored with ticks granularity, we need to * unshift to get the actual value. We do this after the * conversion above (if one was necessary) in order to maximize * precision. */ if (tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS) srtt = srtt >> TCP_RTT_SHIFT; return (srtt); } void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp, bool hw_tls) { if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; #ifdef KERN_TLS if (hw_tls && is_rxt && len != 0) { uint64_t rexmit_percent; rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); if (rexmit_percent > ktls_ifnet_max_rexmit_pct) ktls_disable_ifnet(tp); } #endif } diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index d89515d7eda5..f98381499b2d 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -1,2934 +1,2906 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern ip6proto_ctlinput_t *ip6_ctlprotox[]; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, icmp6_rediraccept) = 1; #define V_icmp6_rediraccept VNET(icmp6_rediraccept) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "Accept ICMPv6 redirect messages"); VNET_DEFINE_STATIC(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ #define V_icmp6_redirtimeout VNET(icmp6_redirtimeout) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "Delay in seconds before expiring redirect route"); VNET_DEFINE_STATIC(int, icmp6_nodeinfo) = 0; #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "Mask of enabled RFC4620 node information query types"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) VNET_DECLARE(int, rip_bind_all_fibs); #define V_rip_bind_all_fibs VNET(rip_bind_all_fibs) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static void icmp6_reflect(struct mbuf *, size_t); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int, struct prison *); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int); +static void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp); /* * Kernel module 
interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; struct epoch_tracker et; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. 
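 * In short, for a multicast (or link-layer broadcast) offender:
 *	Packet Too Big			- error is still sent
 *	Param Problem, bad option	- error is still sent
 *	anything else			- suppressed below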
*/ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) goto freeit; /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); ICMP6STAT_INC2(icp6s_outhist, type); NET_EPOCH_ENTER(et); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ NET_EPOCH_EXIT(et); return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } int icmp6_errmap(const struct icmp6_hdr *icmp6) { switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: return (EHOSTUNREACH); case ICMP6_DST_UNREACH_NOPORT: case ICMP6_DST_UNREACH_ADMIN: return (ECONNREFUSED); case ICMP6_DST_UNREACH_BEYONDSCOPE: return (ENOPROTOOPT); default: return (0); /* Shouldn't happen. */ } case ICMP6_PACKET_TOO_BIG: return (EMSGSIZE); case ICMP6_TIME_EXCEEDED: switch (icmp6->icmp6_code) { case ICMP6_TIME_EXCEED_TRANSIT: return (EHOSTUNREACH); case ICMP6_TIME_EXCEED_REASSEMBLY: return (0); default: return (0); /* Shouldn't happen. */ } case ICMP6_PARAM_PROB: switch (icmp6->icmp6_code) { case ICMP6_PARAMPROB_NEXTHEADER: return (ECONNREFUSED); case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: return (ENOPROTOOPT); default: return (0); /* Shouldn't happen. */ } default: return (0); } } /* * Process a received ICMP6 message. 
*/ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int code, error, icmp6len, ip6len, noff, off, sum; NET_EPOCH_ASSERT(); m = *mp; off = *offp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ icmp6len = m->m_pkthdr.len - off; if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } ip6 = mtod(m, struct ip6_hdr *); ifp = m->m_pkthdr.rcvif; /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* Calculate the checksum. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } ICMP6STAT_INC2(icp6s_inhist, icmp6->icmp6_type); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: case ICMP6_DST_UNREACH_BEYONDSCOPE: case ICMP6_DST_UNREACH_NOPORT: goto deliver; default: goto badcode; } case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* * Validation is made in icmp6_mtudisc_update. * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_TIME_EXCEED_REASSEMBLY: goto deliver; default: goto badcode; } case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: goto deliver; default: goto badcode; } case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if (icmp6_ratelimit(&ip6->ip6_src, ICMP6_ECHO_REPLY, 0)) break; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. 
ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { if (n->m_len < off + sizeof(*nicmp6)) { n = m_pullup(n, off + sizeof(*nicmp6)); if (n == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); break; } } nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC2(icp6s_outhist, ICMP6_ECHO_REPLY); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(&m, off, icmp6len) != 0) { *mp = NULL; return (IPPROTO_DONE); } /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; struct prison *pr; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; pr = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_vnet == ifp->if_vnet) break; sx_sunlock(&allprison_lock); if (pr == NULL) pr = curthread->td_ucred->cr_prison; if (mode == FQDN) { if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off, pr); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } /* * Copy IPv6 and ICMPv6 only. 
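 * The WRU reply built below is then laid out as
 *	[ip6_hdr][icmp6_hdr][4-byte TTL field][hostname]
 * with the hostname truncated to the trailing space of the mbuf.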
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); ICMP6STAT_INC2(icp6s_outhist, ICMP6_WRUREPLY); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... 
*/ goto deliver; } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len) != 0) { /* In this case, m should've been freed. */ *mp = NULL; return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); *mp = m; return (IPPROTO_DONE); freeit: m_freem(m); *mp = NULL; return (IPPROTO_DONE); } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len) { struct mbuf *m; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; m = *mp; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) { m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); /* Detect the upper level protocol */ { u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: if (m->m_len < eoff + sizeof(struct ip6_ext)) { m = m_pullup(m, eoff + sizeof(struct ip6_ext)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } eh = (struct ip6_ext *) (mtod(m, caddr_t) + eoff); if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ if (m->m_len < eoff + sizeof(*rth)) { m = m_pullup(m, eoff + sizeof(*rth)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; if (m->m_len < eoff + rthlen) { m = m_pullup(m, eoff + rthlen); if (m == NULL) { IP6STAT_INC( ip6s_exthdrtoolong); *mp = m; return (-1); } } rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) icmp6dst.sin6_addr = *((struct in6_addr *)(rth0 + 1) + (hops - 1)); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: if (m->m_len < eoff + sizeof(struct ip6_frag)) { m = m_pullup(m, eoff + sizeof(struct ip6_frag)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. 
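 *
 * For reference, the walk above steps "eoff" over each extension
 * header by its true size: AH encodes its length in 32-bit words
 * minus two, all other extension headers in 64-bit words minus one.
 * As a standalone helper this would be (sketch only, not part of
 * this change):
 *
 *	static size_t
 *	ext_hdr_size(u_int8_t nxt, const struct ip6_ext *eh)
 *	{
 *		return (nxt == IPPROTO_AH ?
 *		    ((size_t)eh->ip6e_len + 2) << 2 :
 *		    ((size_t)eh->ip6e_len + 1) << 3);
 *	}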
*/ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); /* * Protocol layers can't do anything useful with unspecified * addresses. */ if (IN6_IS_ADDR_UNSPECIFIED(&eip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&eip6->ip6_dst)) goto freeit; icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (IN6_IS_ADDR_UNSPECIFIED(&icmp6dst.sin6_addr)) icmp6dst.sin6_addr = eip6->ip6_dst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = &icmp6dst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)&notifymtu; - icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ + icmp6_mtudisc_update(&ip6cp); } if (ip6_ctlprotox[nxt] != NULL) ip6_ctlprotox[nxt](&ip6cp); } *mp = m; return (0); freeit: m_freem(m); *mp = NULL; return (-1); } -void -icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) +static void +icmp6_mtudisc_update(struct ip6ctlparam *ip6cp) { struct in6_addr *dst = &ip6cp->ip6c_finaldst->sin6_addr; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; - struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ + struct mbuf *m = ip6cp->ip6c_m; u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; uint32_t max_mtu; -#if 0 - /* - * RFC2460 section 5, last paragraph. - * even though minimum link MTU for IPv6 is IPV6_MMTU, - * we may see ICMPv6 too big with mtu < IPV6_MMTU - * due to packet translator in the middle. - * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for - * special handling. - */ if (mtu < IPV6_MMTU) return; -#endif - - /* - * we reject ICMPv6 too big with abnormally small value. - * XXX what is the good definition of "abnormally small"? - */ - if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) - return; - - if (!validated) - return; - - /* - * In case the suggested mtu is less than IPV6_MMTU, we - * only need to remember that it was for above mentioned - * "alwaysfrag" case. - * Try to be as close to the spec as possible.
- */ - if (mtu < IPV6_MMTU) - mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; max_mtu = tcp_hc_getmtu(&inc); if (max_mtu == 0) max_mtu = tcp_maxmtu6(&inc, NULL); if (mtu < max_mtu) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off, struct prison *pr) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. 
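 *
 * The 03 draft carried the FQDN as a bare Pascal string and no
 * Subject, while the 07 draft uses a DNS-encoded name; e.g. for a
 * hypothetical hostname "fbsd" the reply data would be
 *
 *	03 draft:	\x04 f b s d
 *	07 draft:	\x04 f b s d \x00 \x00
 *
 * which is why "oldfqdn" is recorded here and later handed to
 * ni6_nametodns().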
*/ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; if (m->m_len < off + sizeof(struct icmp6_nodeinfo) + subjlen) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo) + subjlen); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); goto bad; } } /* ip6 possibly invalid but not used after. */ ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); subj = (char *)(mtod(m, caddr_t) + off + sizeof(struct icmp6_nodeinfo)); if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. 
*/ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". 
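 *
 * Example (illustrative names): "ns.example.com" has two dots and
 * is treated as an FQDN, encoding to
 *
 *	\x02 n s \x07 e x a m p l e \x03 c o m \x00
 *
 * while "ns" looks like a shortened hostname and encodes to
 *
 *	\x02 n s \x00 \x00
 *
 * where the second '\0' marks possible truncation, matched
 * leniently by ni6_dnsmatch() below.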
*/ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; NET_EPOCH_ASSERT(); if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? 
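 *
 * The scope filter below admits an address only if the querier set
 * the matching request flag:
 *
 *	IPV6_ADDR_SCOPE_LINKLOCAL	NI_NODEADDR_FLAG_LINKLOCAL
 *	IPV6_ADDR_SCOPE_SITELOCAL	NI_NODEADDR_FLAG_SITELOCAL
 *	IPV6_ADDR_SCOPE_GLOBAL		NI_NODEADDR_FLAG_GLOBAL
 *
 * ::1 is classified as link-local scope by in6_addrscope(), which
 * seems to be why its handling is flagged as an open question.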
*/ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; return (addrsofif); } addrs += addrsofif; } return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; NET_EPOCH_ASSERT(); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. 
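 *
 * Condensed, the lifetime computed below is (sketch):
 *
 *	ltime = (expire == 0)          ? ND6_INFINITE_LIFETIME :
 *	        (expire > time_uptime) ? htonl(expire - time_uptime) :
 *	                                 0;	(already expired)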
*/ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(&ltime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } return (copied); } static bool icmp6_rip6_match(const struct inpcb *inp, void *v) { struct ip6_hdr *ip6 = v; if ((inp->inp_vflag & INP_IPV6) == 0) return (false); if (inp->inp_ip_p != IPPROTO_ICMPV6) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) return (false); return (true); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6); struct inpcb *inp; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; int delivered = 0, fib; /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); *mp = NULL; return (IPPROTO_DONE); } fib = M_GETFIB(m); while ((inp = inp_next(&inpi)) != NULL) { if (V_rip_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum) /* * Sockets bound to a specific FIB can only receive * packets from that FIB. */ continue; if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, inp->in6p_icmp6filt)) continue; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers.
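 *
 * In short: if the whole packet is a single cluster mbuf whose
 * payload would fit in an ordinary mbuf (m_len <= MHLEN), copy it
 * into one; otherwise fall back to m_copym().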
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } else n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n == NULL) continue; if (inp->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(inp, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&inp->inp_socket->so_rcv); if (sbappendaddr_locked(&inp->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { soroverflow_locked(inp->inp_socket); m_freem(n); if (opts) m_freem(opts); } else { sorwakeup_locked(inp->inp_socket); delivered++; } opts = NULL; } m_freem(m); *mp = NULL; if (delivered == 0) IP6STAT_DEC(ip6s_delivered); return (IPPROTO_DONE); } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ static void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; if (__predict_false(IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src))) { nd6log((LOG_DEBUG, "icmp6_reflect: source address is unspecified\n")); goto bad; } /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. 
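 *
 * Source selection for the reply, in summary:
 *
 *	1. the original destination was unicast, is one of our
 *	   addresses and is usable (neither anycast nor
 *	   IN6_IFF_NOTREADY): reuse ip6_dst as the source;
 *	2. otherwise (multicast, anycast, or not ours): pick a
 *	   source with in6_selectsrc_addr() below.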
*/ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6; struct nd_redirect *nd_rd; struct in6_addr src6, redtgt6, reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; char *lladdr; int icmp6len, is_onlink, is_router, lladdrlen; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; ip6 = mtod(m, struct ip6_hdr *); icmp6len = ntohs(ip6->ip6_plen); if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); ifp = m->m_pkthdr.rcvif; redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, ifp, NULL) || in6_setscope(&reddst6, ifp, NULL)) { goto freeit; } /* validation */ src6 = ip6->ip6_src; if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (__predict_false(ip6->ip6_hlim != 255)) { ICMP6STAT_INC(icp6s_invlhlim); nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); NET_EPOCH_ASSERT(); nh = fib6_lookup(ifp->if_fib, &kdst, scopeid, 0, 0); if (nh != NULL) { struct in6_addr nh_addr; nh_addr = ifatoia6(nh->nh_ifa)->ia_addr.sin6_addr; if ((nh->nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* * Embed scope zone id into next hop address. 
*/ nh_addr = nh->gw6_sa.sin6_addr; if (IN6_ARE_ADDR_EQUAL(&src6, &nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } lladdr = NULL; lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. 
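 *
 * That is, per RFC 2461 section 8:
 *
 *	better-router case: redtgt6 != reddst6, the target is a
 *	    link-local router address, so install an RTF_GATEWAY
 *	    route via redtgt6;
 *	on-link case: redtgt6 == reddst6, the destination itself is
 *	    on-link, so the gateway is our own interface address.
 *
 * The route is installed into every FIB via rib_add_redirect()
 * with lifetime V_icmp6_redirtimeout.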
*/ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = 0; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rib_add_redirect(fibnum, (struct sockaddr *)&sdst, gw, (struct sockaddr *)&ssrc, ifp, rt_flags, V_icmp6_redirtimeout); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, M_GETFIB(m0)); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; bcopy(&ia->ia_addr.sin6_addr, &ifp_ll6, sizeof(ifp_ll6)); ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. 
*/ if (nh->nh_flags & NHF_GATEWAY) { struct sockaddr_in6 *sin6; sin6 = &nh->gw6_sa; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(&ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (nh->nh_flags & NHF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; ln = nd6_lookup(router_ll6, LLE_SF(AF_INET6, 0), ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
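 *
 * Worked example (illustrative numbers): with len = 1232 bytes
 * available for the option and m0->m_pkthdr.len = 61:
 *
 *	61 + sizeof(*nd_opt_rh) = 69 <= 1232, so there is room;
 *	extra = 61 % 8 = 5, so 3 bytes of zero padding are added
 *	(assuming M_TRAILINGSPACE(m0) >= 3), giving a 64-byte
 *	payload and an option length field of (64 + 8) >> 3 = 9
 *	(in 8-byte units).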
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ bzero(m0->m_data + m0->m_len, 8 - extra); m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } ICMP6STAT_INC2(icp6s_outhist, ND_REDIRECT); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. 
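 *
 * The only option handled is ICMP6_FILTER (RFC 3542).  A minimal
 * userland sketch exercising the SOPT_SET path (standalone
 * example, not part of this file):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/icmp6.h>
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	struct icmp6_filter filt;
 *
 *	ICMP6_FILTER_SETBLOCKALL(&filt);
 *	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
 *	setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, &filt,
 *	    sizeof(filt));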
*/ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case SOPT_SET: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } static int sysctl_icmp6lim_and_jitter(SYSCTL_HANDLER_ARGS); VNET_DEFINE_STATIC(u_int, icmp6errppslim) = 100; #define V_icmp6errppslim VNET(icmp6errppslim) SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, &sysctl_icmp6lim_and_jitter, "IU", "Maximum number of ICMPv6 error/reply messages per second"); typedef enum { RATELIM_PARAM_PROB = 0, RATELIM_TOO_BIG, RATELIM_UNREACH, RATELIM_TEXCEED, RATELIM_REDIR, RATELIM_REPLY, RATELIM_OTHER, RATELIM_MAX } ratelim_which; static const char *icmp6_rate_descrs[RATELIM_MAX] = { [RATELIM_PARAM_PROB] = "bad IPv6 header", [RATELIM_TOO_BIG] = "packet too big", [RATELIM_UNREACH] = "destination unreachable", [RATELIM_TEXCEED] = "time exceeded", [RATELIM_REPLY] = "echo reply", [RATELIM_REDIR] = "neighbor discovery redirect", [RATELIM_OTHER] = "(other)", }; VNET_DEFINE_STATIC(int, icmp6lim_curr_jitter[RATELIM_MAX]) = {0}; #define V_icmp6lim_curr_jitter VNET(icmp6lim_curr_jitter) VNET_DEFINE_STATIC(u_int, icmp6lim_jitter) = 8; #define V_icmp6lim_jitter VNET(icmp6lim_jitter) SYSCTL_PROC(_net_inet6_icmp6, OID_AUTO, icmp6lim_jitter, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6lim_jitter), 0, &sysctl_icmp6lim_and_jitter, "IU", "Random errppslimit jitter adjustment limit"); VNET_DEFINE_STATIC(int, icmp6lim_output) = 1; #define V_icmp6lim_output VNET(icmp6lim_output) SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, icmp6lim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6lim_output), 0, "Enable logging of ICMPv6 response rate limiting"); static void icmp6lim_new_jitter(int which) { /* * Adjust limit +/- to jitter the measurement to deny a side-channel * port scan as in https://dl.acm.org/doi/10.1145/3372297.3417280 */ KASSERT(which >= 0 && which < RATELIM_MAX, ("%s: which %d", __func__, which)); if (V_icmp6lim_jitter > 0) V_icmp6lim_curr_jitter[which] = arc4random_uniform(V_icmp6lim_jitter * 2 + 1) - V_icmp6lim_jitter; } static int sysctl_icmp6lim_and_jitter(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; bool lim; MPASS(oidp->oid_arg1 == &VNET_NAME(icmp6errppslim) || oidp->oid_arg1 == &VNET_NAME(icmp6lim_jitter)); lim = (oidp->oid_arg1 == &VNET_NAME(icmp6errppslim)); new = lim ? 
V_icmp6errppslim : V_icmp6lim_jitter; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (lim) { if (new != 0 && new <= V_icmp6lim_jitter) error = EINVAL; else V_icmp6errppslim = new; } else { if (new >= V_icmp6errppslim) error = EINVAL; else { V_icmp6lim_jitter = new; for (int i = 0; i < RATELIM_MAX; i++) { icmp6lim_new_jitter(i); } } } } MPASS(V_icmp6errppslim == 0 || V_icmp6errppslim > V_icmp6lim_jitter); return (error); } VNET_DEFINE_STATIC(struct counter_rate *, icmp6_rates[RATELIM_MAX]); #define V_icmp6_rates VNET(icmp6_rates) static void icmp6_ratelimit_init(void) { for (int i = 0; i < RATELIM_MAX; i++) { V_icmp6_rates[i] = counter_rate_alloc(M_WAITOK, 1); icmp6lim_new_jitter(i); } } VNET_SYSINIT(icmp6_ratelimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp6_ratelimit_init, NULL); #ifdef VIMAGE static void icmp6_ratelimit_uninit(void) { for (int i = 0; i < RATELIM_MAX; i++) counter_rate_free(V_icmp6_rates[i]); } VNET_SYSUNINIT(icmp6_ratelimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp6_ratelimit_uninit, NULL); #endif /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * code - not used at this moment */ int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { ratelim_which which; int64_t pps; if (V_icmp6errppslim == 0) return (0); switch (type) { case ICMP6_PARAM_PROB: which = RATELIM_PARAM_PROB; break; case ICMP6_PACKET_TOO_BIG: which = RATELIM_TOO_BIG; break; case ICMP6_DST_UNREACH: which = RATELIM_UNREACH; break; case ICMP6_TIME_EXCEEDED: which = RATELIM_TEXCEED; break; case ND_REDIRECT: which = RATELIM_REDIR; break; case ICMP6_ECHO_REPLY: which = RATELIM_REPLY; break; default: which = RATELIM_OTHER; break; }; pps = counter_ratecheck(V_icmp6_rates[which], V_icmp6errppslim + V_icmp6lim_curr_jitter[which]); if (pps > 0) { if (V_icmp6lim_output) log(LOG_NOTICE, "Limiting ICMPv6 %s output from %jd " "to %d packets/sec\n", icmp6_rate_descrs[which], (intmax_t )pps, V_icmp6errppslim + V_icmp6lim_curr_jitter[which]); icmp6lim_new_jitter(which); } if (pps == -1) { ICMP6STAT_INC(icp6s_toofreq); return (-1); } return (0); } diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index ed71c58fffbe..6299ce6e146b 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1,3380 +1,3341 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SCTP) || defined(SCTP_SUPPORT) #include #include #endif #include extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); -static int ip6_getpmtu(struct route_in6 *, int, - struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, - u_int); -static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, - u_long *, int *, u_int); +static void ip6_getpmtu(struct route_in6 *, int, + struct ifnet *, const struct in6_addr *, u_long *, u_int, u_int); +static void ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, + u_long *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, * mp is the destination, and _ol is the optlen. */ #define MAKE_EXTHDR(hp, mp, _ol) \ do { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ (_ol) += (*(mp))->m_len; \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. 
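 *
 * MAKE_CHAIN() splices one extension-header mbuf in after "mp" and
 * fixes up the next-header chain: the inserted header inherits the
 * old *p value as its own next-header byte, *p is overwritten with
 * "i", and p then points at the new header's next-header field.  A
 * typical invocation below has the shape (illustrative):
 *
 *	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp,
 *	    IPPROTO_HOPOPTS);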
*/ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("%s:%d: assumption failed: "\ "hdr not split: hdrsplit %d exthdrs %p",\ __func__, __LINE__, hdrsplit, &exthdrs);\ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } static void ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags, int plen, int optlen) { KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p " "csum_flags %#x", __func__, __LINE__, plen, optlen, m, ifp, csum_flags)); if (csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen - optlen, sizeof(struct ip6_hdr) + optlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int fraglen , uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8")); m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += fraglen) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + fraglen >= tlen) fraglen = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(fraglen + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f); ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } static int ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. 
This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_epg_tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { m_freem(m); error = EAGAIN; goto done; } /* * Always stamp tags that include NIC ktls. */ stamp_tag = true; } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { m_freem(m); error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* * IP6 output. * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len, * nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_nh initialized, route lookup would be * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL, * then result of route lookup is stored in ro->ro_nh. * * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu * is uint32_t. So we use u_long to hold largest one, which is rt_mtu. * * ifpp - XXX: just for statistics */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev; struct route_in6 *ro_pmtu; struct nhop_object *nh; struct sockaddr_in6 *dst, sin6, src_sa, dst_sa; struct in6_addr odst; u_char *nexthdrp; int tlen, len; int error = 0; int vlan_pcp = -1; struct in6_ifaddr *ia = NULL; u_long mtu; - int alwaysfrag, dontfrag; + int dontfrag; u_int32_t optlen, plen = 0, unfragpartlen; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; bool hdrsplit; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; uint32_t optvalid; NET_EPOCH_ASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* Unconditionally set flowid. */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } if ((inp->inp_flags2 & INP_2PCP_SET) != 0) vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } /* Source address validation. 
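* An unspecified (::) source is rejected unless the caller passed IPV6_UNSPECSRC, * and a multicast source is always rejected; both cases count as ip6s_badscope * and fail with EOPNOTSUPP below.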
*/ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } /* * If we are given packet options to add extension headers, prepare them. * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ bzero(&exthdrs, sizeof(exthdrs)); optlen = optvalid = 0; unfragpartlen = sizeof(struct ip6_hdr); if (opt) { optvalid = opt->ip6po_valid; /* Hop-by-Hop options header. */ if ((optvalid & IP6PO_VALID_HBH) != 0) MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen); /* Destination options header (1st part). */ if ((optvalid & IP6PO_VALID_RHINFO) != 0) { #ifndef RTHDR_SUPPORT_IMPLEMENTED /* * If there is a routing header, discard the packet * right away here. RH0/1 are obsolete and we do not * currently support RH2/3/4. * People trying to use RH253/254 may want to disable * this check. * The moment we do support any routing header (again) * this block should check the routing type more * selectively. */ error = EINVAL; goto bad; #endif /* * Destination options header (1st part). * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ if ((optvalid & IP6PO_VALID_DEST1) != 0) MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1, optlen); } /* Routing header. */ if ((optvalid & IP6PO_VALID_RHINFO) != 0) MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen); unfragpartlen += optlen; /* * NOTE: we don't add AH/ESP length here (done in * ip6_ipsec_output()). */ /* Destination options header (2nd part). */ if ((optvalid & IP6PO_VALID_DEST2) != 0) MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen); } /* * If there is at least one extension header, * separate IP6 header from the payload. */ hdrsplit = false; if (optlen) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; ip6 = mtod(m, struct ip6_hdr *); hdrsplit = true; } /* Adjust mbuf packet header length. */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; ip6 = mtod(m, struct ip6_hdr *); hdrsplit = true; } if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); nexthdrp = &ip6->ip6_nxt; if (optlen) { /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. * Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]. * * During the header composing process "m" points to IPv6 * header. "mprev" points to an extension header prior to esp. */ mprev = m; /* * We treat dest2 specially. This makes IPsec processing * much easier. The goal here is to make mprev point to the * mbuf prior to dest2. * * Result: IPv6 dest2 payload. * m and mprev will point to IPv6 header.
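* (dest2 is spliced directly after the IPv6 header first, so that AH/ESP added * later by IPsec end up between mprev and dest2; hbh/dest1/rthdr are then * chained in front of it via MAKE_CHAIN below.)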
*/ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("%s:%d: assumption failed: " "hdr not split: hdrsplit %d exthdrs %p", __func__, __LINE__, hdrsplit, &exthdrs); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * Result: IPv6 hbh dest1 rthdr dest2 payload. * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); } IP6STAT_INC(ip6s_localout); /* Route packet. */ ro_pmtu = ro; if ((optvalid & IP6PO_VALID_RHINFO) != 0) ro = &opt->ip6po_route; if (ro != NULL) dst = (struct sockaddr_in6 *)&ro->ro_dst; else dst = &sin6; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * If specified, try to fill in the traffic class field. * Do not override if a non-zero value is already set. * We check the diffserv field and the ECN field separately. */ if ((optvalid & IP6PO_VALID_TC) != 0) { int mask = 0; if (IPV6_DSCP(ip6) == 0) mask |= 0xfc; if (IPV6_ECN(ip6) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* Fill in or override the hop limit field, if necessary. */ if ((optvalid & IP6PO_VALID_HLIM) != 0) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } if (ro == NULL || ro->ro_nh == NULL) { bzero(dst, sizeof(*dst)); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = ip6->ip6_dst; } /* * Validate route against routing table changes. * Make sure that the address family is set in route. */ nh = NULL; ifp = NULL; mtu = 0; if (ro != NULL) { if (ro->ro_nh != NULL && inp != NULL) { ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */ NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_nh != NULL && fwd_tag == NULL && (!NH_IS_VALID(ro->ro_nh) || ro->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst))) RO_INVALIDATE_CACHE(ro); if (ro->ro_nh != NULL && fwd_tag == NULL && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { /* Nexthop is valid and contains a valid ifp. */ nh = ro->ro_nh; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp, &nh, fibnum, m->m_pkthdr.flowid); if (error != 0) { IP6STAT_INC(ip6s_noroute); if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } /* * At this point at least @ifp is not NULL. * That can be the case when dst is multicast, link-local, or * the interface is explicitly specified by the caller. */ } if (nh == NULL) { /* * If in6_selectroute() does not return a nexthop, * dst may not have been updated.
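* In that case, use the dst_sa prepared above and fall back to the interface MTU.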
*/ *dst = dst_sa; /* XXX */ origifp = ifp; mtu = ifp->if_mtu; } else { ifp = nh->nh_ifp; origifp = nh->nh_aifp; ia = (struct in6_ifaddr *)(nh->nh_ifa); counter_u64_add(nh->nh_pksent, 1); } } else { struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) && im6o != NULL && (ifp = im6o->im6o_multicast_ifp) != NULL) { /* We do not need a route lookup. */ *dst = dst_sa; /* XXX */ origifp = ifp; goto nonh6lookup; } in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid); if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) { if (scopeid > 0) { ifp = in6_getlinkifnet(scopeid); if (ifp == NULL) { error = EHOSTUNREACH; goto bad; } *dst = dst_sa; /* XXX */ origifp = ifp; goto nonh6lookup; } } nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, m->m_pkthdr.flowid); if (nh == NULL) { IP6STAT_INC(ip6s_noroute); /* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */ error = EHOSTUNREACH; goto bad; } ifp = nh->nh_ifp; origifp = nh->nh_aifp; ia = ifatoia6(nh->nh_ifa); if (nh->nh_flags & NHF_GATEWAY) dst->sin6_addr = nh->gw6_sa.sin6_addr; else if (fwd_tag != NULL) dst->sin6_addr = dst_sa.sin6_addr; nonh6lookup: ; } /* * At this point ifp MUST be pointing to the valid transmit ifp. * origifp MUST be valid and pointing to either the same ifp or, * in case of loopback output, to the interface which ip6_src * belongs to. * Examples: * fe80::1%em0 -> fe80::2%em0 -> ifp=em0, origifp=em0 * fe80::1%em0 -> fe80::1%em0 -> ifp=lo0, origifp=em0 * ::1 -> ::1 -> ifp=lo0, origifp=lo0 * * mtu can be 0 and will be refined later. */ KASSERT((ifp != NULL), ("output interface must not be NULL")); KASSERT((origifp != NULL), ("output address interface must not be NULL")); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ if (IPSEC_ENABLED(ipv6)) { if ((error = IPSEC_OUTPUT(ipv6, ifp, m, inp, mtu == 0 ? ifp->if_mtu : mtu)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } #endif /* IPSEC */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } /* Setup data structures for scope ID checks. */ src0 = ip6->ip6_src; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; dst0 = ip6->ip6_dst; /* Re-initialize to be sure. */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; /* Check for valid scope ID. */ if (in6_setscope(&src0, origifp, &zone) == 0 && sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id && in6_setscope(&dst0, origifp, &zone) == 0 && sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) { /* * The outgoing interface is in the zone of the source * and destination addresses. 
* */ } else if ((origifp->if_flags & IFF_LOOPBACK) == 0 || sa6_recoverscope(&src_sa) != 0 || sa6_recoverscope(&dst_sa) != 0 || dst_sa.sin6_scope_id == 0 || (src_sa.sin6_scope_id != 0 && src_sa.sin6_scope_id != dst_sa.sin6_scope_id) || ifnet_byindex(dst_sa.sin6_scope_id) == NULL) { /* * If the destination network interface is not a * loopback interface, or the destination network * address has no scope ID, or the source address has * a scope ID set which is different from that of the * destination address, or there is no network * interface representing this scope ID, the address * pair is considered invalid. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; } /* All scope ID checks are successful. */ if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if ((optvalid & IP6PO_VALID_NHINFO) != 0) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((nh->nh_flags & NHF_GATEWAY)) dst = &nh->gw6_sa; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU.
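* ip6_getpmtu() prefers the MTU cached in ro_pmtu (or a fresh fib6_lookup()), * and ip6_calcmtu() below falls back to the link MTU when no route MTU is * known; the result may be lowered further to IPV6_MMTU (1280) by the * minimum-MTU policy that follows.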
*/ - if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, - &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) - goto bad; - KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p " - "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro, - ifp, alwaysfrag, fibnum)); + ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, fibnum, + *nexthdrp); + KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p fibnum %u", + __func__, __LINE__, mtu, ro_pmtu, ro, ifp, fibnum)); /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * Clear embedded scope identifiers if necessary. * in6_clearscope() will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point. */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ switch (pfil_mbuf_out(V_inet6_pfil_head, &m, ifp, inp)) { case PFIL_PASS: ip6 = mtod(m, struct ip6_hdr *); break; case PFIL_DROPPED: error = EACCES; /* FALLTHROUGH */ case PFIL_CONSUMED: goto done; } needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). 
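* (the filter rewrote the destination to a local address, so the packet is * marked M_FASTFWD_OURS and requeued through netisr as if it had been received)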
*/ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { if (ro != NULL) RO_INVALIDATE_CACHE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); if (ro != NULL) RO_INVALIDATE_CACHE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { if (ro != NULL) dst = (struct sockaddr_in6 *)&ro->ro_dst; else dst = &sin6; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); /* Ensure the packet data is mapped if the interface requires it. */ if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { struct mbuf *m1; error = mb_unmapped_to_ext(m, &m1); if (error != 0) { if (error == EINVAL) { if_printf(ifp, "TLS packet\n"); /* XXXKIB */ } else if (error == ENOMEM) { error = ENOBUFS; } IP6STAT_INC(ip6s_odropped); return (error); } else { m = m1; } } /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * - * The logic here is rather complex: - * 1: normal case (dontfrag == 0, alwaysfrag == 0) + * 1: normal case (dontfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu - * - * 3: if we always need to attach fragment header (alwaysfrag == 1) - * always fragment - * - * 4: if dontfrag == 1 && alwaysfrag == 1 - * error, as we cannot handle this conflicting request. */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen); /* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */ tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; - if (dontfrag && alwaysfrag) { /* Case 4. */ - /* Conflicting request - can't transmit. */ - error = EMSGSIZE; - goto bad; - } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* Case 2-b. 
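* dontfrag is set but the packet exceeds the link MTU, so the send fails with * EMSGSIZE below. In the fragmenting path further down, each fragment carries * (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7 payload bytes; e.g. an * mtu of 1500 with a bare IPv6 header (unfragpartlen 40) gives 1448 bytes.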
*/ /* - * Even if the DONTFRAG option is specified, we cannot send the + * If the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* Transmit packet without fragmentation. */ - if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* Cases 1-a and 2-a. */ + if (dontfrag || tlen <= mtu) { /* Cases 1-a and 2-a. */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = ip6_output_send(inp, ifp, origifp, m, dst, ro, (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } - /* Try to fragment the packet. Cases 1-b and 3. */ + /* Try to fragment the packet. Case 1-b. */ if (mtu < IPV6_MMTU) { /* Path MTU cannot be less than IPV6_MMTU. */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* Jumbo payload cannot be fragmented. */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags, plen, optlen); /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { ip6 = mtod(m, struct ip6_hdr *); nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id); if (error != 0) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* Remove leading garbage. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); error = ip6_output_send(inp, ifp, origifp, m, dst, ro, true); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem() checks if mbuf is NULL. 
*/ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. 
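* If that mbuf is writable and has sizeof(struct ip6_frag) bytes of trailing * space, the fragment header is appended there; otherwise a separate mbuf is * allocated for it below.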
*/ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct epoch_tracker et; struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; int error; in6_splitscope(dst, &kdst, &scopeid); NET_EPOCH_ENTER(et); nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0); - if (nh != NULL) - error = ip6_calcmtu(nh->nh_ifp, dst, nh->nh_mtu, mtup, NULL, 0); - else + if (nh != NULL) { + ip6_calcmtu(nh->nh_ifp, dst, nh->nh_mtu, mtup, 0); + error = 0; + } else error = EHOSTUNREACH; NET_EPOCH_EXIT(et); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * - * Stores mtu and always-frag value into @mtup and @alwaysfragp. - * Returns 0 on success. + * Stores mtu into @mtup. */ -static int +static void ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, - int *alwaysfragp, u_int fibnum, u_int proto) + u_int fibnum, u_int proto) { struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst, sin6; u_long mtu; NET_EPOCH_ASSERT(); mtu = 0; if (ro_pmtu == NULL || do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ if (ro_pmtu != NULL) { sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; } else sa6_dst = &sin6; if (ro_pmtu == NULL || ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE, 0); if (nh != NULL) { mtu = nh->nh_mtu; if (ro_pmtu != NULL) ro_pmtu->ro_mtu = mtu; } } else mtu = ro_pmtu->ro_mtu; } if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL) mtu = ro_pmtu->ro_nh->nh_mtu; - return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); + ip6_calcmtu(ifp, dst, mtu, mtup, proto); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. - * Stores mtu and always-frag value into @mtup and @alwaysfragp. - * - * Returns 0 on success. + * Stores mtu into @mtup. 
*/ -static int +static void ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, - u_long *mtup, int *alwaysfragp, u_int proto) + u_long *mtup, u_int proto) { u_long mtu = 0; - int alwaysfrag = 0; - int error = 0; if (rt_mtu > 0) { - u_int32_t ifmtu; - struct in_conninfo inc; - - bzero(&inc, sizeof(inc)); - inc.inc_flags |= INC_ISIPV6; - inc.inc6_faddr = *dst; - - ifmtu = IN6_LINKMTU(ifp); + /* Skip the hostcache if the protocol handles PMTU changes. */ + if (proto != IPPROTO_TCP && proto != IPPROTO_SCTP) { + struct in_conninfo inc = { + .inc_flags = INC_ISIPV6, + .inc6_faddr = *dst, + }; - /* TCP is known to react to pmtu changes so skip hc */ - if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); + } if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; - if (mtu == 0) - mtu = ifmtu; - else if (mtu < IPV6_MMTU) { - /* - * RFC2460 section 5, last paragraph: - * if we record ICMPv6 too big message with - * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU - * or smaller, with framgent header attached. - * (fragment header is needed regardless from the - * packet size, for translators to identify packets) - */ - alwaysfrag = 1; - mtu = IPV6_MMTU; - } - } else if (ifp) { + } + + if (mtu == 0) mtu = IN6_LINKMTU(ifp); - } else - error = EHOSTUNREACH; /* XXX */ *mtup = mtu; - if (alwaysfragp) - *alwaysfragp = alwaysfrag; - return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *inp = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif /* * Don't use more than a quarter of mbuf clusters. N.B.: * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow * on LP64 architectures, so cast to u_long to avoid undefined * behavior. ILP32 architectures cannot have nmbclusters * large enough to overflow for other reasons. 
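* For instance, with nmbclusters = 1000000 and MCLBYTES = 2048 the limit works * out to 1000000UL * 2048 / 4 = 512000000 bytes, i.e. roughly 488 MB.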
*/ #define IPV6_PKTOPTIONS_MBUF_LIMIT ((u_long)nmbclusters * MCLBYTES / 4) level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; INP_WLOCK(inp); if ((inp->inp_flags & INP_BOUNDFIB) != 0 && optval != so->so_fibnum) { INP_WUNLOCK(inp); error = EISCONN; break; } error = sosetfib(inp->inp_socket, optval); if (error == 0) inp->inp_inc.inc_fibnum = optval; INP_WUNLOCK(inp); break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) { printf("ip6_ctloutput: mbuf limit hit\n"); error = ENOBUFS; break; } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; INP_WLOCK(inp); error = ip6_pcbopts(&inp->in6p_outputopts, m, so, sopt); INP_WUNLOCK(inp); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_ORIGDSTADDR: case IPV6_BINDANY: case IPV6_VLAN_PCP: if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ inp->in6p_hops = optval; if ((inp->inp_vflag & INP_IPV4) != 0) inp->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(inp); \ inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (inp->inp_flags & (bit) ? 
1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ OPTSET2_N(bit, val); \ INP_WUNLOCK(inp); \ } while (0) #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ INP_WLOCK(inp); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ inp->inp_flags |= (bit); \ else \ inp->inp_flags &= ~(bit); \ } \ INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: OPTSET2292_EXCLUSIVE(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_RECVHOPLIMIT: OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: OPTSET2292_EXCLUSIVE(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: INP_WLOCK(inp); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { /* * The socket is already bound. */ INP_WUNLOCK(inp); error = EINVAL; break; } if (optval) { inp->inp_flags |= IN6P_IPV6_V6ONLY; inp->inp_vflag &= ~INP_IPV4; } else { inp->inp_flags &= ~IN6P_IPV6_V6ONLY; inp->inp_vflag |= INP_IPV4; } INP_WUNLOCK(inp); break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ OPTSET2292_EXCLUSIVE(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_VLAN_PCP: if ((optval >= -1) && (optval <= (INP_2PCP_MASK >> INP_2PCP_SHIFT))) { if (optval == -1) { INP_WLOCK(inp); inp->inp_flags2 &= ~(INP_2PCP_SET | INP_2PCP_MASK); INP_WUNLOCK(inp); } else { INP_WLOCK(inp); inp->inp_flags2 |= INP_2PCP_SET; inp->inp_flags2 &= ~INP_2PCP_MASK; inp->inp_flags2 |= optval << INP_2PCP_SHIFT; INP_WUNLOCK(inp); } } else error = EINVAL; break; } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? 
td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(inp); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(inp, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IPV6_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. 
*/ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_VLAN_PCP: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = inp->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = inp->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = inp->inp_flowid; break; case IPV6_FLOWTYPE: optval = inp->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IPV6_VLAN_PCP: if (OPTBIT2(INP_2PCP_SET)) { optval = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; } else { optval = -1; } break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct in6_addr addr; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we do not consider the case of source * routing, or optional information to specify * the outgoing interface. * Copy faddr out of inp to avoid holding lock * on inp during route lookup.
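* The value reported to the application is clamped to IPV6_MAXPACKET below.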
*/ INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &addr, sizeof(addr)); INP_RUNLOCK(inp); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(inp, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *inp = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if (optval < -1 || (optval % 2) != 0) { /* * The API assumes non-negative even offset * values or -1 as a special value. */ error = EINVAL; } else if (inp->inp_ip_p == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else inp->in6p_cksum = optval; break; case SOPT_GET: if (inp->inp_ip_p == IPPROTO_ICMPV6) optval = icmp6off; else optval = inp->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; struct epoch_tracker et; /* turn off any old options. 
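* Passing a NULL or empty mbuf only clears the previously set options; a * non-empty mbuf replaces them wholesale via ip6_setpktopts() below.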
*/ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else { opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT); if (opt == NULL) return (ENOMEM); } *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ NET_EPOCH_ENTER(et); if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); NET_EPOCH_EXIT(et); return (error); } NET_EPOCH_EXIT(et); *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct epoch_tracker et; struct ip6_pktopts *opt; int ret; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_NOWAIT); if (*pktopt == NULL) return (ENOBUFS); ip6_initpktopts(*pktopt); } opt = *pktopt; NET_EPOCH_ENTER(et); ret = ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto); NET_EPOCH_EXIT(et); return (ret); } #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ INP_RUNLOCK(inp); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ INP_RLOCK(inp); \ if (inp->inp_flags & INP_DROPPED) { \ INP_RUNLOCK(inp); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ pktopt = inp->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(pktopt->field, optdata, optdatalen); \ } else { \ free(optdata, M_TEMP); \ optdata = NULL; \ malloc_optdata = false; \ } \ } \ } while(0) #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field, \ (((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3) #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field, \ pktopt->field->sa_len) static int ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; int optdatalen = 0; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; INP_RLOCK(inp); pktopt = inp->in6p_outputopts; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... 
*/ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) deftclass = pktopt->ip6po_tclass; optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: GET_PKTOPT_EXT_HDR(ip6po_hbh); break; case IPV6_RTHDR: GET_PKTOPT_EXT_HDR(ip6po_rthdr); break; case IPV6_RTHDRDSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest1); break; case IPV6_DSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest2); break; case IPV6_NEXTHOP: GET_PKTOPT_SOCKADDR(ip6po_nexthop); break; case IPV6_USE_MIN_MTU: if (pktopt) defminmtu = pktopt->ip6po_minmtu; optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) defpreftemp = pktopt->ip6po_prefer_tempaddr; optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif INP_RUNLOCK(inp); return (ENOPROTOOPT); } INP_RUNLOCK(inp); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) free(optdata, M_TEMP); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) { pktopt->ip6po_hlim = -1; pktopt->ip6po_valid &= ~IP6PO_VALID_HLIM; } if (optname == -1 || optname == IPV6_TCLASS) { pktopt->ip6po_tclass = -1; pktopt->ip6po_valid &= ~IP6PO_VALID_TC; } if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_nh) { NH_FREE(pktopt->ip6po_nextroute.ro_nh); pktopt->ip6po_nextroute.ro_nh = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; pktopt->ip6po_valid &= ~IP6PO_VALID_NHINFO; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; pktopt->ip6po_valid &= ~IP6PO_VALID_HBH; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; pktopt->ip6po_valid &= ~IP6PO_VALID_DEST1; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_nh) { NH_FREE(pktopt->ip6po_route.ro_nh); pktopt->ip6po_route.ro_nh = NULL; } pktopt->ip6po_valid &= ~IP6PO_VALID_RHINFO; } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; pktopt->ip6po_valid &= ~IP6PO_VALID_DEST2; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; 
dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ dst->ip6po_valid = src->ip6po_valid; return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); /* * ip6_setpktopt can call ifnet_byindex(), so it's imperative that we * are in the network epoch here. */ NET_EPOCH_ASSERT(); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; NET_EPOCH_ASSERT(); if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. 
/*
 * Set a particular packet option, as a sticky option or an ancillary data
 * item.  "len" can be 0 only when it's a sticky option.
 * We have 4 cases of combination of "sticky" and "cmsg":
 * "sticky=0, cmsg=0": impossible
 * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
 * "sticky=1, cmsg=0": RFC3542 socket option
 * "sticky=1, cmsg=1": RFC2292 socket option
 */
static int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
    struct ucred *cred, int sticky, int cmsg, int uproto)
{
	int minmtupolicy, preftemp;
	int error;

	NET_EPOCH_ASSERT();

	if (!sticky && !cmsg) {
#ifdef DIAGNOSTIC
		printf("ip6_setpktopt: impossible case\n");
#endif
		return (EINVAL);
	}

	/*
	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
	 * not be specified in the context of RFC3542.  Conversely,
	 * RFC3542 types should not be specified in the context of RFC2292.
	 */
	if (!cmsg) {
		switch (optname) {
		case IPV6_2292PKTINFO:
		case IPV6_2292HOPLIMIT:
		case IPV6_2292NEXTHOP:
		case IPV6_2292HOPOPTS:
		case IPV6_2292DSTOPTS:
		case IPV6_2292RTHDR:
		case IPV6_2292PKTOPTIONS:
			return (ENOPROTOOPT);
		}
	}
	if (sticky && cmsg) {
		switch (optname) {
		case IPV6_PKTINFO:
		case IPV6_HOPLIMIT:
		case IPV6_NEXTHOP:
		case IPV6_HOPOPTS:
		case IPV6_DSTOPTS:
		case IPV6_RTHDRDSTOPTS:
		case IPV6_RTHDR:
		case IPV6_USE_MIN_MTU:
		case IPV6_DONTFRAG:
		case IPV6_TCLASS:
		case IPV6_PREFER_TEMPADDR:	/* XXX: not an RFC3542 option */
			return (ENOPROTOOPT);
		}
	}

	switch (optname) {
	case IPV6_2292PKTINFO:
	case IPV6_PKTINFO:
	{
		struct ifnet *ifp = NULL;
		struct in6_pktinfo *pktinfo;

		if (len != sizeof(struct in6_pktinfo))
			return (EINVAL);

		pktinfo = (struct in6_pktinfo *)buf;

		/*
		 * An application can clear any sticky IPV6_PKTINFO option by
		 * doing a "regular" setsockopt with ipi6_addr being
		 * in6addr_any and ipi6_ifindex being zero.
		 * [RFC 3542, Section 6]
		 */
		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
		    pktinfo->ipi6_ifindex == 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
			ip6_clearpktopts(opt, optname);
			break;
		}

		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
			return (EINVAL);
		}
		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
			return (EINVAL);
		/* validate the interface index if specified. */
		if (pktinfo->ipi6_ifindex) {
			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
			if (ifp == NULL)
				return (ENXIO);
		}
		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
			return (ENETDOWN);

		if (ifp != NULL &&
		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
			struct in6_ifaddr *ia;

			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
			if (ia == NULL)
				return (EADDRNOTAVAIL);
			ifa_free(&ia->ia_ifa);
		}

		/*
		 * We store the address anyway, and let in6_selectsrc()
		 * validate the specified address.  This is because ipi6_addr
		 * may not have enough information about its scope zone, and
		 * we may need additional information (such as outgoing
		 * interface or the scope zone of a destination address) to
		 * disambiguate the scope.
		 * XXX: the delay of the validation may confuse the
		 * application when it is used as a sticky option.
		 */
		if (opt->ip6po_pktinfo == NULL) {
			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
			    M_IP6OPT, M_NOWAIT);
			if (opt->ip6po_pktinfo == NULL)
				return (ENOBUFS);
		}
		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
		opt->ip6po_valid |= IP6PO_VALID_PKTINFO;
		break;
	}
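
	/*
	 * Example (userland sketch): the RFC 3542, Section 6 "clear" form
	 * handled above is simply an all-zero in6_pktinfo passed as a
	 * regular (sticky) setsockopt:
	 *
	 *	struct in6_pktinfo pi;
	 *	memset(&pi, 0, sizeof(pi));	// in6addr_any, ifindex 0
	 *	(void)setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
	 *	    &pi, sizeof(pi));
	 *
	 * which takes the ip6_clearpktopts() early-out rather than storing
	 * a new ip6po_pktinfo.
	 */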
	case IPV6_2292HOPLIMIT:
	case IPV6_HOPLIMIT:
	{
		int *hlimp;

		/*
		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
		 * to simplify the ordering among hoplimit options.
		 */
		if (optname == IPV6_HOPLIMIT && sticky)
			return (ENOPROTOOPT);

		if (len != sizeof(int))
			return (EINVAL);
		hlimp = (int *)buf;
		if (*hlimp < -1 || *hlimp > 255)
			return (EINVAL);

		opt->ip6po_hlim = *hlimp;
		opt->ip6po_valid |= IP6PO_VALID_HLIM;
		break;
	}

	case IPV6_TCLASS:
	{
		int tclass;

		if (len != sizeof(int))
			return (EINVAL);
		tclass = *(int *)buf;
		if (tclass < -1 || tclass > 255)
			return (EINVAL);

		opt->ip6po_tclass = tclass;
		opt->ip6po_valid |= IP6PO_VALID_TC;
		break;
	}

	case IPV6_2292NEXTHOP:
	case IPV6_NEXTHOP:
		if (cred != NULL) {
			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
			if (error)
				return (error);
		}

		if (len == 0) {	/* just remove the option */
			ip6_clearpktopts(opt, IPV6_NEXTHOP);
			break;
		}

		/* check if cmsg_len is large enough for sa_len */
		if (len < sizeof(struct sockaddr) || len < *buf)
			return (EINVAL);

		switch (((struct sockaddr *)buf)->sa_family) {
		case AF_INET6:
		{
			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
			int error;

			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
				return (EINVAL);

			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
				return (EINVAL);
			}
			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
			    != 0) {
				return (error);
			}
			break;
		}
		case AF_LINK:	/* should eventually be supported */
		default:
			return (EAFNOSUPPORT);
		}

		/* turn off the previous option, then set the new option. */
		ip6_clearpktopts(opt, IPV6_NEXTHOP);
		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
		if (opt->ip6po_nexthop == NULL)
			return (ENOBUFS);
		bcopy(buf, opt->ip6po_nexthop, *buf);
		opt->ip6po_valid |= IP6PO_VALID_NHINFO;

		break;

	case IPV6_2292HOPOPTS:
	case IPV6_HOPOPTS:
	{
		struct ip6_hbh *hbh;
		int hbhlen;

		/*
		 * XXX: We don't allow a non-privileged user to set ANY HbH
		 * options, since per-option restriction has too much
		 * overhead.
		 */
		if (cred != NULL) {
			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
			if (error)
				return (error);
		}

		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_HOPOPTS);
			break;	/* just remove the option */
		}

		/* message length validation */
		if (len < sizeof(struct ip6_hbh))
			return (EINVAL);
		hbh = (struct ip6_hbh *)buf;
		hbhlen = (hbh->ip6h_len + 1) << 3;
		if (len != hbhlen)
			return (EINVAL);

		/* turn off the previous option, then set the new option. */
		ip6_clearpktopts(opt, IPV6_HOPOPTS);
		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
		if (opt->ip6po_hbh == NULL)
			return (ENOBUFS);
		bcopy(hbh, opt->ip6po_hbh, hbhlen);
		opt->ip6po_valid |= IP6PO_VALID_HBH;

		break;
	}
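
	/*
	 * Worked example of the (ip6h_len + 1) << 3 check above: a
	 * hop-by-hop or destination options header is always a multiple of
	 * 8 bytes, and its length byte counts 8-byte units beyond the
	 * first.  A minimal header with ip6h_len == 0 must therefore arrive
	 * as exactly (0 + 1) << 3 == 8 bytes of option data; a 16-byte
	 * header has ip6h_len == 1, and so on.  Any other cmsg payload size
	 * is rejected with EINVAL.
	 */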
	case IPV6_2292DSTOPTS:
	case IPV6_DSTOPTS:
	case IPV6_RTHDRDSTOPTS:
	{
		struct ip6_dest *dest, **newdest = NULL;
		int destlen;

		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
			if (error)
				return (error);
		}

		if (len == 0) {
			ip6_clearpktopts(opt, optname);
			break;	/* just remove the option */
		}

		/* message length validation */
		if (len < sizeof(struct ip6_dest))
			return (EINVAL);
		dest = (struct ip6_dest *)buf;
		destlen = (dest->ip6d_len + 1) << 3;
		if (len != destlen)
			return (EINVAL);

		/*
		 * Determine the position that the destination options header
		 * should be inserted; before or after the routing header.
		 */
		switch (optname) {
		case IPV6_2292DSTOPTS:
			/*
			 * The old advanced API is ambiguous on this point.
			 * Our approach is to determine the position based
			 * on the existence of a routing header.
			 * Note, however, that this depends on the order of the
			 * extension headers in the ancillary data; the 1st
			 * part of the destination options header must appear
			 * before the routing header in the ancillary data,
			 * too.
			 * RFC3542 solved the ambiguity by introducing
			 * separate ancillary data or option types.
			 */
			if (opt->ip6po_rthdr == NULL)
				newdest = &opt->ip6po_dest1;
			else
				newdest = &opt->ip6po_dest2;
			break;
		case IPV6_RTHDRDSTOPTS:
			newdest = &opt->ip6po_dest1;
			break;
		case IPV6_DSTOPTS:
			newdest = &opt->ip6po_dest2;
			break;
		}

		/* turn off the previous option, then set the new option. */
		ip6_clearpktopts(opt, optname);
		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
		if (*newdest == NULL)
			return (ENOBUFS);
		bcopy(dest, *newdest, destlen);
		if (newdest == &opt->ip6po_dest1)
			opt->ip6po_valid |= IP6PO_VALID_DEST1;
		else
			opt->ip6po_valid |= IP6PO_VALID_DEST2;

		break;
	}

	case IPV6_2292RTHDR:
	case IPV6_RTHDR:
	{
		struct ip6_rthdr *rth;
		int rthlen;

		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_RTHDR);
			break;	/* just remove the option */
		}

		/* message length validation */
		if (len < sizeof(struct ip6_rthdr))
			return (EINVAL);
		rth = (struct ip6_rthdr *)buf;
		rthlen = (rth->ip6r_len + 1) << 3;
		if (len != rthlen)
			return (EINVAL);

		switch (rth->ip6r_type) {
		case IPV6_RTHDR_TYPE_0:
			if (rth->ip6r_len == 0)	/* must contain one addr */
				return (EINVAL);
			if (rth->ip6r_len % 2) /* length must be even */
				return (EINVAL);
			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
				return (EINVAL);
			break;
		default:
			return (EINVAL);	/* not supported */
		}

		/* turn off the previous option */
		ip6_clearpktopts(opt, IPV6_RTHDR);
		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
		if (opt->ip6po_rthdr == NULL)
			return (ENOBUFS);
		bcopy(rth, opt->ip6po_rthdr, rthlen);
		opt->ip6po_valid |= IP6PO_VALID_RHINFO;

		break;
	}

	case IPV6_USE_MIN_MTU:
		if (len != sizeof(int))
			return (EINVAL);
		minmtupolicy = *(int *)buf;
		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
		    minmtupolicy != IP6PO_MINMTU_ALL) {
			return (EINVAL);
		}
		opt->ip6po_minmtu = minmtupolicy;
		break;

	case IPV6_DONTFRAG:
		if (len != sizeof(int))
			return (EINVAL);

		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
			/*
			 * we ignore this option for TCP sockets.
			 * (RFC3542 leaves this case unspecified.)
			 */
			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
		} else
			opt->ip6po_flags |= IP6PO_DONTFRAG;
		break;

	case IPV6_PREFER_TEMPADDR:
		if (len != sizeof(int))
			return (EINVAL);
		preftemp = *(int *)buf;
		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
		    preftemp != IP6PO_TEMPADDR_PREFER) {
			return (EINVAL);
		}
		opt->ip6po_prefer_tempaddr = preftemp;
		break;

	default:
		return (ENOPROTOOPT);
	} /* end of switch */

	return (0);
}
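
/*
 * Sizing note for the type 0 routing header accepted above (illustrative
 * arithmetic only): each IPv6 address occupies two 8-byte units, so a
 * header carrying one intermediate hop has ip6r_len == 2, ip6r_segleft
 * == 1, and must arrive as (2 + 1) << 3 == 24 bytes; two hops give
 * ip6r_len == 4 and 40 bytes.  The "ip6r_len % 2" and "ip6r_len / 2 ==
 * ip6r_segleft" checks encode exactly this layout.
 */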
/*
 * Routine called from ip6_output() to loop back a copy of an IP6 multicast
 * packet to the input queue of a specified interface.  Note that this
 * calls the output routine of the loopback "driver", but with an interface
 * pointer that might NOT be &loif -- easier than replicating that code here.
 */
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
{
	struct mbuf *copym;
	struct ip6_hdr *ip6;

	copym = m_copym(m, 0, M_COPYALL, M_NOWAIT);
	if (copym == NULL)
		return;

	/*
	 * Make sure to deep-copy IPv6 header portion in case the data
	 * is in an mbuf cluster, so that we can safely override the IPv6
	 * header portion later.
	 */
	if (!M_WRITABLE(copym) ||
	    copym->m_len < sizeof(struct ip6_hdr)) {
		copym = m_pullup(copym, sizeof(struct ip6_hdr));
		if (copym == NULL)
			return;
	}
	ip6 = mtod(copym, struct ip6_hdr *);
	/*
	 * clear embedded scope identifiers if necessary.
	 * in6_clearscope will touch the addresses only when necessary.
	 */
	in6_clearscope(&ip6->ip6_src);
	in6_clearscope(&ip6->ip6_dst);
	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
		    CSUM_PSEUDO_HDR;
		copym->m_pkthdr.csum_data = 0xffff;
	}
	if_simloop(ifp, copym, AF_INET6, 0);
}

/*
 * Chop the IPv6 header off the payload.
 */
static int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
	struct mbuf *mh;
	struct ip6_hdr *ip6;

	ip6 = mtod(m, struct ip6_hdr *);
	if (m->m_len > sizeof(*ip6)) {
		mh = m_gethdr(M_NOWAIT, MT_DATA);
		if (mh == NULL) {
			m_freem(m);
			return (ENOBUFS);
		}
		m_move_pkthdr(mh, m);
		M_ALIGN(mh, sizeof(*ip6));
		m->m_len -= sizeof(*ip6);
		m->m_data += sizeof(*ip6);
		mh->m_next = m;
		m = mh;
		m->m_len = sizeof(*ip6);
		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
	}
	exthdrs->ip6e_ip6 = m;
	return (0);
}

/*
 * Compute IPv6 extension header length.
 */
int
ip6_optlen(struct inpcb *inp)
{
	int len;

	if (!inp->in6p_outputopts)
		return (0);

	len = 0;
#define elen(x) \
    (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)

	len += elen(inp->in6p_outputopts->ip6po_hbh);
	if (inp->in6p_outputopts->ip6po_rthdr)
		/* dest1 is valid with rthdr only */
		len += elen(inp->in6p_outputopts->ip6po_dest1);
	len += elen(inp->in6p_outputopts->ip6po_rthdr);
	len += elen(inp->in6p_outputopts->ip6po_dest2);
	return (len);
#undef elen
}
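
/*
 * Worked example for ip6_optlen() (illustrative values): with a sticky
 * 8-byte hop-by-hop header (ip6e_len 0), a 24-byte type 0 routing header
 * (ip6e_len 2), and an 8-byte second destination options header
 * (ip6e_len 0), elen() yields 8 + 24 + 8 = 40 bytes of extension headers;
 * ip6po_dest1 would only be counted if a routing header is also present.
 */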