Index: head/sys/net/if.h =================================================================== --- head/sys/net/if.h (revision 162083) +++ head/sys/net/if.h (revision 162084) @@ -1,448 +1,451 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_H_ #define _NET_IF_H_ #include #ifdef _KERNEL #include #endif #if __BSD_VISIBLE /* * does not depend on on most other systems. This * helps userland compatibility. (struct timeval ifi_lastchange) */ #ifndef _KERNEL #include #endif struct ifnet; #endif /* * Length of interface external name, including terminating '\0'. * Note: this is the same size as a generic device's external name. */ #define IF_NAMESIZE 16 #if __BSD_VISIBLE #define IFNAMSIZ IF_NAMESIZE #define IF_MAXUNIT 0x7fff /* historical value */ #endif #if __BSD_VISIBLE /* * Structure used to query names of interface cloners. */ struct if_clonereq { int ifcr_total; /* total cloners (out) */ int ifcr_count; /* room for this many in user buffer */ char *ifcr_buffer; /* buffer for cloner names */ }; /* * Structure describing information about an interface * which may be of interest to management entities. */ struct if_data { /* generic interface information */ u_char ifi_type; /* ethernet, tokenring, etc */ u_char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ u_char ifi_addrlen; /* media address length */ u_char ifi_hdrlen; /* media header length */ u_char ifi_link_state; /* current link state */ u_char ifi_recvquota; /* polling quota for receive intrs */ u_char ifi_xmitquota; /* polling quota for xmit intrs */ u_char ifi_datalen; /* length of this data struct */ u_long ifi_mtu; /* maximum transmission unit */ u_long ifi_metric; /* routing metric (external only) */ u_long ifi_baudrate; /* linespeed */ /* volatile statistics */ u_long ifi_ipackets; /* packets received on interface */ u_long ifi_ierrors; /* input errors on interface */ u_long ifi_opackets; /* packets sent on interface */ u_long ifi_oerrors; /* output errors on interface */ u_long ifi_collisions; /* collisions on csma interfaces */ u_long ifi_ibytes; /* total number of octets received */ u_long ifi_obytes; /* total number of octets sent */ u_long ifi_imcasts; /* packets received via multicast */ u_long ifi_omcasts; /* packets sent via multicast */ u_long ifi_iqdrops; /* dropped on input, this interface */ u_long ifi_noproto; /* destined for unsupported protocol */ u_long ifi_hwassist; /* HW offload capabilities, see IFCAP */ time_t ifi_epoch; /* uptime at attach or stat reset */ struct timeval ifi_lastchange; /* time of last administrative change */ }; /*- * Interface flags are of two types: network stack owned flags, and driver * owned flags. Historically, these values were stored in the same ifnet * flags field, but with the advent of fine-grained locking, they have been * broken out such that the network stack is responsible for synchronizing * the stack-owned fields, and the device driver the device-owned fields. * Both halves can perform lockless reads of the other half's field, subject * to accepting the involved races. * * Both sets of flags come from the same number space, and should not be * permitted to conflict, as they are exposed to user space via a single * field. * * The following symbols identify read and write requirements for fields: * * (i) if_flags field set by device driver before attach, read-only there * after. * (n) if_flags field written only by the network stack, read by either the * stack or driver. * (d) if_drv_flags field written only by the device driver, read by either * the stack or driver. */ #define IFF_UP 0x1 /* (n) interface is up */ #define IFF_BROADCAST 0x2 /* (i) broadcast address valid */ #define IFF_DEBUG 0x4 /* (n) turn on debugging */ #define IFF_LOOPBACK 0x8 /* (i) is a loopback net */ #define IFF_POINTOPOINT 0x10 /* (i) is a point-to-point link */ #define IFF_SMART 0x20 /* (i) interface manages own routes */ #define IFF_DRV_RUNNING 0x40 /* (d) resources allocated */ #define IFF_NOARP 0x80 /* (n) no address resolution protocol */ #define IFF_PROMISC 0x100 /* (n) receive all packets */ #define IFF_ALLMULTI 0x200 /* (n) receive all multicast packets */ #define IFF_DRV_OACTIVE 0x400 /* (d) tx hardware queue is full */ #define IFF_SIMPLEX 0x800 /* (i) can't hear own transmissions */ #define IFF_LINK0 0x1000 /* per link layer defined bit */ #define IFF_LINK1 0x2000 /* per link layer defined bit */ #define IFF_LINK2 0x4000 /* per link layer defined bit */ #define IFF_ALTPHYS IFF_LINK2 /* use alternate physical connection */ #define IFF_MULTICAST 0x8000 /* (i) supports multicast */ /* 0x10000 */ #define IFF_PPROMISC 0x20000 /* (n) user-requested promisc mode */ #define IFF_MONITOR 0x40000 /* (n) user-requested monitor mode */ #define IFF_STATICARP 0x80000 /* (n) static ARP */ #define IFF_NEEDSGIANT 0x100000 /* (i) hold Giant over if_start calls */ /* * Old names for driver flags so that user space tools can continue to use * the old (portable) names. */ #ifndef _KERNEL #define IFF_RUNNING IFF_DRV_RUNNING #define IFF_OACTIVE IFF_DRV_OACTIVE #endif /* flags set internally only: */ #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_SMART|IFF_PROMISC) /* * Values for if_link_state. */ #define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */ #define LINK_STATE_DOWN 1 /* link is down */ #define LINK_STATE_UP 2 /* link is up */ /* * Some convenience macros used for setting ifi_baudrate. * XXX 1000 vs. 1024? --thorpej@netbsd.org */ #define IF_Kbps(x) ((x) * 1000) /* kilobits/sec. */ #define IF_Mbps(x) (IF_Kbps((x) * 1000)) /* megabits/sec. */ #define IF_Gbps(x) (IF_Mbps((x) * 1000)) /* gigabits/sec. */ /* * Capabilities that interfaces can advertise. * * struct ifnet.if_capabilities * contains the optional features & capabilities a particular interface * supports (not only the driver but also the detected hw revision). * Capabilities are defined by IFCAP_* below. * struct ifnet.if_capenabled * contains the enabled (either by default or through ifconfig) optional * features & capabilities on this interface. * Capabilities are defined by IFCAP_* below. * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above * contains the enabled optional feature & capabilites that can be used * individually per packet and are specified in the mbuf pkthdr.csum_flags * field. IFCAP_* and CSUM_* do not match one to one and CSUM_* may be * more detailed or differenciated that IFCAP_*. * Hwassist features are defined CSUM_* in sys/mbuf.h */ #define IFCAP_RXCSUM 0x0001 /* can offload checksum on RX */ #define IFCAP_TXCSUM 0x0002 /* can offload checksum on TX */ #define IFCAP_NETCONS 0x0004 /* can be a network console */ #define IFCAP_VLAN_MTU 0x0008 /* VLAN-compatible MTU */ #define IFCAP_VLAN_HWTAGGING 0x0010 /* hardware VLAN tag support */ #define IFCAP_JUMBO_MTU 0x0020 /* 9000 byte MTU supported */ #define IFCAP_POLLING 0x0040 /* driver supports polling */ #define IFCAP_VLAN_HWCSUM 0x0080 /* can do IFCAP_HWCSUM on VLANs */ +#define IFCAP_TSO4 0x0100 /* can do TCP Segmentation Offload */ +#define IFCAP_TSO6 0x0200 /* can do TCP6 Segmentation Offload */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) +#define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* * Message format for use in obtaining information about interfaces * from getkerninfo and the routing socket */ struct if_msghdr { u_short ifm_msglen; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ struct if_data ifm_data;/* statistics and other data about if */ }; /* * Message format for use in obtaining information about interface addresses * from getkerninfo and the routing socket */ struct ifa_msghdr { u_short ifam_msglen; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ int ifam_metric; /* value of ifa_metric */ }; /* * Message format for use in obtaining information about multicast addresses * from the routing socket */ struct ifma_msghdr { u_short ifmam_msglen; /* to skip over non-understood messages */ u_char ifmam_version; /* future binary compatibility */ u_char ifmam_type; /* message type */ int ifmam_addrs; /* like rtm_addrs */ int ifmam_flags; /* value of ifa_flags */ u_short ifmam_index; /* index for associated ifp */ }; /* * Message format announcing the arrival or departure of a network interface. */ struct if_announcemsghdr { u_short ifan_msglen; /* to skip over non-understood messages */ u_char ifan_version; /* future binary compatibility */ u_char ifan_type; /* message type */ u_short ifan_index; /* index for associated ifp */ char ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */ u_short ifan_what; /* what type of announcement */ }; #define IFAN_ARRIVAL 0 /* interface arrival */ #define IFAN_DEPARTURE 1 /* interface departure */ /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; short ifru_flags[2]; short ifru_index; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; caddr_t ifru_data; int ifru_cap[2]; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ #define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ #define ifr_flags ifr_ifru.ifru_flags[0] /* flags (low 16 bits) */ #define ifr_flagshigh ifr_ifru.ifru_flags[1] /* flags (high 16 bits) */ #define ifr_metric ifr_ifru.ifru_metric /* metric */ #define ifr_mtu ifr_ifru.ifru_mtu /* mtu */ #define ifr_phys ifr_ifru.ifru_phys /* physical wire */ #define ifr_media ifr_ifru.ifru_media /* physical media */ #define ifr_data ifr_ifru.ifru_data /* for use by interface */ #define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ ((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \ (sizeof(struct ifreq) - sizeof(struct sockaddr) + \ (ifr).ifr_addr.sa_len) : sizeof(struct ifreq)) struct ifaliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; }; struct ifmediareq { char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */ int ifm_current; /* current media options */ int ifm_mask; /* don't care mask */ int ifm_status; /* media status */ int ifm_active; /* active options */ int ifm_count; /* # entries in ifm_ulist array */ int *ifm_ulist; /* media words */ }; struct ifdrv { char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ unsigned long ifd_cmd; size_t ifd_len; void *ifd_data; }; /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting * needed by ifconfig(8): each line starts with a TAB and ends with * a newline. The canonical example to copy and paste is in if_tun.c. */ #define IFSTATMAX 800 /* 10 lines of text */ struct ifstat { char ifs_name[IFNAMSIZ]; /* if name, e.g. "en0" */ char ascii[IFSTATMAX + 1]; }; /* * Structure used in SIOCGIFCONF request. * Used to retrieve interface configuration * for machine (useful for programs which * must know all networks accessible). */ struct ifconf { int ifc_len; /* size of associated buffer */ union { caddr_t ifcu_buf; struct ifreq *ifcu_req; } ifc_ifcu; #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; #if defined (__amd64__) || defined (COMPAT_32BIT) struct ifconf32 { int ifc_len; /* size of associated buffer */ union { u_int ifcu_buf; u_int ifcu_req; } ifc_ifcu; }; #endif /* * interface groups */ #define IFG_ALL "all" /* group contains all interfaces */ /* XXX: will we implement this? */ #define IFG_EGRESS "egress" /* if(s) default route(s) point to */ struct ifg_req { union { char ifgrqu_group[IFNAMSIZ]; char ifgrqu_member[IFNAMSIZ]; } ifgrq_ifgrqu; #define ifgrq_group ifgrq_ifgrqu.ifgrqu_group #define ifgrq_member ifgrq_ifgrqu.ifgrqu_member }; /* * Used to lookup groups for an interface */ struct ifgroupreq { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; struct ifg_req *ifgru_groups; } ifgr_ifgru; #define ifgr_group ifgr_ifgru.ifgru_group #define ifgr_groups ifgr_ifgru.ifgru_groups }; /* * Structure for SIOC[AGD]LIFADDR */ struct if_laddrreq { char iflr_name[IFNAMSIZ]; u_int flags; #define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */ u_int prefixlen; /* in/out */ struct sockaddr_storage addr; /* in/out */ struct sockaddr_storage dstaddr; /* out */ }; #endif /* __BSD_VISIBLE */ #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); #endif #endif #ifndef _KERNEL struct if_nameindex { unsigned int if_index; /* 1, 2, ... */ char *if_name; /* null terminated name: "le0", ... */ }; __BEGIN_DECLS void if_freenameindex(struct if_nameindex *); char *if_indextoname(unsigned int, char *); struct if_nameindex *if_nameindex(void); unsigned int if_nametoindex(const char *); __END_DECLS #endif #ifdef _KERNEL struct thread; /* XXX - this should go away soon. */ #include #endif #endif /* !_NET_IF_H_ */ Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 162083) +++ head/sys/netinet/ip_output.c (revision 162084) @@ -1,1641 +1,1652 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(IPSEC) || defined(FAST_IPSEC) #include #ifdef IPSEC #include #endif #ifdef FAST_IPSEC #include #endif #endif /*IPSEC*/ #include static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); static int ip_getmoptions(struct inpcb *, struct sockopt *); static int ip_setmoptions(struct inpcb *, struct sockopt *); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) INP_LOCK_ASSERT(inp); if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, * short circuit routing lookup. */ if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ip->ip_ttl = 1; isbroadcast = 1; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) rtalloc_ign(ro, 0); if (ro->ro_rt == NULL) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } #ifndef notdef /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } #endif /* notdef */ /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > ifp->if_mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(FAST_IPSEC) switch(ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst, &ia, &ifp)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take - * care of the fragmentation for us, can just send directly. + * care of the fragmentation for us, we can just send directly. */ - if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && - ((ip->ip_off & IP_DF) == 0))) { + if (ip->ip_len <= ifp->if_mtu || + (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || + ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); - /* Record statistics for this interface address. */ + /* + * Record statistics for this interface address. + * With CSUM_TSO the byte/packet count will be slightly + * incorrect because we count the IP+TCP headers only + * once instead of for every generated packet. + */ if (!(flags & IP_FORWARDING) && ia) { - ia->ia_ifa.if_opackets++; + if (m->m_pkthdr.csum_flags & CSUM_TSO) + ia->ia_ifa.if_opackets += + m->m_pkthdr.len / m->m_pkthdr.tso_segsz; + else + ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } - if (ip->ip_off & IP_DF) { + /* Balk when DF bit is set or the interface didn't support TSO. */ + if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if (ro != NULL && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m->m_flags &= ~(M_PROTOFLAGS); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_create_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the appropriate next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_LOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_UNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_LOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_UNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; } break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_LOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_UNLOCK(inp); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { caddr_t req; size_t len = 0; int priv; struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; priv = (sopt->sopt_td != NULL && suser(sopt->sopt_td) != 0) ? 0 : 1; req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; error = ipsec4_set_policy(inp, optname, req, len, priv); m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * XXX * The whole multicast option thing needs to be re-thought. * Several of these options are equally applicable to non-multicast * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. */ static struct ifnet * ip_multicast_if(a, ifindexp) struct in_addr *a; int *ifindexp; { int ifindex; struct ifnet *ifp; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; if (ifindex < 0 || if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); if (ifindexp) *ifindexp = ifindex; } else { INADDR_TO_IFP(*a, ifp); } return ifp; } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. */ static struct ip_moptions * ip_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; INP_LOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_UNLOCK(inp); imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = (struct in_multi **)malloc((sizeof(*immp) * IP_MIN_MEMBERSHIPS), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; INP_LOCK(inp); if (inp->inp_moptions != NULL) { free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Set the IP multicast options in response to user setsockopt(). */ static int ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) { int error = 0; int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp; struct ip_moptions *imo; struct route ro; struct sockaddr_in *dst; int ifindex; int s; switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ case IP_MULTICAST_VIF: if (legal_vif_num == 0) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) break; if (!legal_vif_num(i) && (i != -1)) { error = EINVAL; break; } imo = ip_findmoptions(inp); imo->imo_multicast_vif = i; INP_UNLOCK(inp); break; case IP_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ imo = ip_findmoptions(inp); if (addr.s_addr == INADDR_ANY) { imo->imo_multicast_ifp = NULL; INP_UNLOCK(inp); break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { INP_UNLOCK(inp); splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_UNLOCK(inp); splx(s); break; case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char ttl; error = sooptcopyin(sopt, &ttl, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } else { u_int ttl; error = sooptcopyin(sopt, &ttl, sizeof ttl, sizeof ttl); if (error) break; if (ttl > 255) error = EINVAL; else { imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } } break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char loop; error = sooptcopyin(sopt, &loop, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } else { u_int loop; error = sooptcopyin(sopt, &loop, sizeof loop, sizeof loop); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } break; case IP_ADD_MEMBERSHIP: /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ if (mreq.imr_interface.s_addr == INADDR_ANY) { bzero((caddr_t)&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); break; } ifp = ro.ro_rt->rt_ifp; RTFREE(ro.ro_rt); } else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); } /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; splx(s); break; } /* * See if the membership already exists or if all the * membership slots are full. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i < imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRINUSE; splx(s); break; } if (imo->imo_num_memberships == imo->imo_max_memberships) { struct in_multi **nmships, **omships; size_t newmax; /* * Resize the vector to next power-of-two minus 1. If the * size would exceed the maximum then we know we've really * run out of entries. Otherwise, we realloc() the vector * with the INP lock held to avoid introducing a race. */ nmships = NULL; omships = imo->imo_membership; newmax = ((imo->imo_max_memberships + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(*nmships) * newmax, M_IPMOPTS, M_NOWAIT); if (nmships != NULL) { imo->imo_membership = nmships; imo->imo_max_memberships = newmax; } } if (nmships == NULL) { INP_UNLOCK(inp); error = ETOOMANYREFS; splx(s); break; } } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { INP_UNLOCK(inp); error = ENOBUFS; splx(s); break; } ++imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; case IP_DROP_MEMBERSHIP: /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.imr_interface.s_addr == INADDR_ANY) ifp = NULL; else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; splx(s); break; } } /* * Find the membership in the membership array. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i == imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRNOTAVAIL; splx(s); break; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ static int ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; struct in_addr addr; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_LOCK(inp); imo = inp->inp_moptions; error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_UNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; } INP_UNLOCK(inp); error = sooptcopyout(sopt, &addr, sizeof addr); break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; default: INP_UNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Discard the IP multicast options. */ void ip_freemoptions(imo) register struct ip_moptions *imo; { register int i; if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) in_delmulti(imo->imo_membership[i]); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(ifp, m, dst, hlen) struct ifnet *ifp; register struct mbuf *m; register struct sockaddr_in *dst; int hlen; { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. For the moment, we'll err on the side * of safety by using if_simloop(). */ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else if_simloop(ifp, copym, dst->sin_family, 0); #endif } } Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 162083) +++ head/sys/netinet/tcp_input.c (revision 162084) @@ -1,3305 +1,3310 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef FAST_IPSEC #include #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include static const int tcprexmtthresh = 3; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif static int tcp_do_rfc3042 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); static int tcp_insecure_rst = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, &tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets."); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); int tcp_reass_qsize = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_maxqlen = 48; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, &tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); static int tcp_reass_overflows = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static int tcp_timewait(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!callout_active(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { tcp_reass_maxseg = nmbclusters / 16; uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; void tcp_reass_init() { tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; INP_LOCK_ASSERT(tp->t_inpcb); /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==NULL after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == NULL) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let the missing segment through which caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || tp->t_segqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } /* * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup_locked(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ int headlocked = 0; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: avoid compiler warning */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: KASSERT(headlocked, ("tcp_input: findpcb: head not locked")); #ifdef IPFIREWALL_FORWARD /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } else { #endif /* IPFIREWALL_FORWARD */ if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #ifdef IPFIREWALL_FORWARD } #endif /* IPFIREWALL_FORWARD */ #if defined(IPSEC) || defined(FAST_IPSEC) #ifdef INET6 if (isipv6) { if (inp != NULL && ipsec6_in_reject(m, inp)) { #ifdef IPSEC ipsec6stat.in_polvio++; #endif goto drop; } } else #endif /* INET6 */ if (inp != NULL && ipsec4_in_reject(m, inp)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif goto drop; } #endif /*IPSEC || FAST_IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. */ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { #ifdef INET6 strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); #endif } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if ((thflags & TH_SYN) == 0) break; /* FALLTHROUGH */ case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%02x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); /* Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto drop; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto drop; } if (inp->inp_vflag & INP_TIMEWAIT) { /* * The only option of relevance is TOF_CC, and only if * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); if (tcp_timewait(inp, &to, th, m, tlen)) goto findpcb; /* * tcp_timewait unlocks inp. */ INP_INFO_WUNLOCK(&tcbinfo); return; } tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; #ifdef MAC INP_LOCK_ASSERT(inp); if (mac_check_inpcb_deliver(inp, m)) goto drop; #endif so = inp->inp_socket; KASSERT(so != NULL, ("tcp_input: so == NULL")); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache has free'd mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); if (!syncache_add(&inc, &to, th, inp, &so, m)) goto drop; /* XXX: does not happen */ if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. Everything unlocked * already. */ return; } panic("T/TCP not supported at the moment"); #if 0 /* T/TCP */ /* * Segment passed TAO tests. * XXX: Can't happen at the moment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * T/TCP logic: * If there is a FIN or if there is data, then * delay SYN,ACK(SYN) in the hope of piggy-backing * it on a response segment. Otherwise must send * ACK now in case the other side is slow starting. */ if (thflags & TH_FIN || tlen != 0) tp->t_flags |= (TF_DELACK | TF_NEEDSYN); else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tiwin = th->th_win << tp->snd_scale; tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; #endif /* T/TCP */ } goto drop; } after_listen: KASSERT(headlocked, ("tcp_input: after_listen: head not locked")); INP_LOCK_ASSERT(inp); /* Syncache takes care of sockets in the listen state. */ KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN")); /* * This is the second part of the MSS DoS prevention code (after * minmss on the sending side) and it deals with too many too small * tcp packets in a too short timeframe (1 second). * * For every full second we count the number of received packets * and bytes. If we get a lot of packets per second for this connection * (tcp_minmssoverload) we take a closer look at it and compute the * average packet size for the past second. If that is less than * tcp_minmss we get too many packets with very small payload which * is not good and burdens our system (and every packet generates * a wakeup to the process connected to our socket). We can reasonable * expect this to be small packet DoS attack to exhaust our CPU * cycles. * * Care has to be taken for the minimum packet overload value. This * value defines the minimum number of packets per second before we * start to worry. This must not be too low to avoid killing for * example interactive connections with many small packets like * telnet or SSH. * * Setting either tcp_minmssoverload or tcp_minmss to "0" disables * this check. * * Account for packet if payload packet, skip over ACK, etc. */ if (tcp_minmss && tcp_minmssoverload && tp->t_state == TCPS_ESTABLISHED && tlen > 0) { if ((unsigned int)(tp->rcv_second - ticks) < hz) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_pps > tcp_minmssoverload) { if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { printf("too many small tcp packets from " "%s:%u, av. %lubyte/packet, " "dropping connection\n", #ifdef INET6 isipv6 ? ip6_sprintf(&inp->inp_inc.inc6_faddr) : #endif inet_ntoa(inp->inp_inc.inc_faddr), inp->inp_inc.inc_fport, tp->rcv_byps / tp->rcv_pps); KASSERT(headlocked, ("tcp_input: " "after_listen: tcp_drop: head " "not locked")); tp = tcp_drop(tp, ECONNRESET); tcpstat.tcps_minmssdrops++; goto drop; } } } else { tp->rcv_second = ticks + hz; tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } } /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Unscale the window into a 32-bit value. * This value is bogus for the TCPS_SYN_SENT state * and is overwritten later. */ tiwin = th->th_win << tp->snd_scale; /* * Parse options on any incoming segment. */ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0) && TSTMP_GT(to.to_tsecr, ticks)) to.to_tsecr = 0; /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_requested_s_scale; tp->snd_wnd = th->th_win << tp->snd_scale; tiwin = tp->snd_wnd; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; else tp->t_flags |= TF_SACK_PERMIT; } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && to.to_nsacks == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif * Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (thflags & TH_RST) { if (thflags & TH_ACK) { KASSERT(headlocked, ("tcp_input: after_listen" ": tcp_drop.2: head not locked")); tp = tcp_drop(tp, ECONNREFUSED); } goto drop; } if ((thflags & TH_SYN) == 0) goto drop; /* Initial send window, already scaled. */ tp->snd_wnd = th->th_win; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); callout_stop(tp->tt_rexmt); tp->t_state = TCPS_SYN_RECEIVED; } #if 0 /* T/TCP */ trimthenstep6: #endif KASSERT(headlocked, ("tcp_input: trimthenstep6: head not " "locked")); INP_LOCK_ASSERT(inp); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) { if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (tp->last_ack_sent != th->th_seq && tcp_insecure_rst == 0) { tcpstat.tcps_badrst++; goto drop; } case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; KASSERT(headlocked, ("tcp_input: " "trimthenstep6: tcp_close: head not " "locked")); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: KASSERT(headlocked, ("trimthenstep6: " "tcp_close.2: head not locked")); tp = tcp_close(tp); break; case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(headlocked, ("trimthenstep6: tcp_close.3: head not " "locked")); tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { KASSERT(headlocked, ("trimthenstep6: " "tcp_close.4: head not locked")); tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { KASSERT(headlocked, ("tcp_input: tcp_drop: trimthenstep6: " "head not locked")); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if (tp->sack_enable && (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { if (tp->sack_enable && IN_FASTRECOVERY(tp)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { tp->t_dupacks = 0; break; } } else if (tcp_do_newreno) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big")); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("sent too much")); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: KASSERT(headlocked, ("tcp_input: process_ACK: head not " "locked")); INP_LOCK_ASSERT(inp); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ if ((!tcp_do_newreno && !tp->sack_enable) || !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup_locked(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ /* XXXjl * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("tcp_input: process_ACK: " "head not locked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { KASSERT(headlocked, ("tcp_input: process_ACK:" " tcp_close: head not locked")); tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: KASSERT(headlocked, ("tcp_input: step6: head not locked")); INP_LOCK_ASSERT(inp); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("tcp_input: dodata: head not locked")); INP_LOCK_ASSERT(inp); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); sorwakeup_locked(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && tp->sack_enable) tcp_update_sack_list(tp, save_start, save_end); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("tcp_input: dodata: " "TCP_FIN_WAIT_2: head not locked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); return; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(headlocked == 0, ("tcp_input: check_delack: head locked")); INP_LOCK_ASSERT(inp); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } INP_UNLOCK(inp); return; dropafterack: KASSERT(headlocked, ("tcp_input: dropafterack: head not locked")); /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("headlocked should be 1")); INP_INFO_WUNLOCK(&tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); m_freem(m); return; dropwithreset: KASSERT(headlocked, ("tcp_input: dropwithreset: head not locked")); /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (tp != NULL) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(to, cp, cnt, flags) struct tcpopt *to; u_char *cp; int cnt; int flags; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!tcp_do_sack) continue; to->to_flags |= TOF_SACK; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; tcpstat.tcps_sack_rcv_blocks++; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; INP_LOCK_ASSERT(tp->t_inpcb); tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { int rtt, mss; u_long bufsize; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct hc_metrics_lite metrics; int origoffer = offer; + int mtuflags = 0; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif /* initialize */ #ifdef INET6 if (isipv6) { - maxmtu = tcp_maxmtu6(&inp->inp_inc); + maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { - maxmtu = tcp_maxmtu(&inp->inp_inc); + maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } so = inp->inp_socket; /* * no route to sender, stay with default mss and return */ if (maxmtu == 0) return; /* what have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt. */ offer = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif tcp_mssdflt; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); } /* * rmx information is now retrieved from tcp_hostcache */ tcp_hc_get(&inp->inp_inc, &metrics); /* * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates, that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too */ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overloads the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. * We currently check only in syncache_socket for that. */ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; + + /* Check the interface for TSO capabilities. */ + if (mtuflags & CSUM_TSO) + tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(inc) struct in_conninfo *inc; { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; - maxmtu = tcp_maxmtu6(inc); + maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; - maxmtu = tcp_maxmtu(inc); + maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > th->th_ack - tp->snd_una) tp->snd_cwnd -= th->th_ack - tp->snd_una; else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ static int tcp_timewait(inp, to, th, m, tlen) struct inpcb *inp; struct tcpopt *to; struct tcphdr *th; struct mbuf *m; int tlen; { struct tcptw *tw; int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif /* tcbinfo lock required for tcp_twclose(), tcp_2msl_reset. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_timer_2msl_reset(tw, 2 * tcp_msl, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_UNLOCK(inp); return (0); drop: INP_UNLOCK(inp); m_freem(m); return (0); } Index: head/sys/netinet/tcp_reass.c =================================================================== --- head/sys/netinet/tcp_reass.c (revision 162083) +++ head/sys/netinet/tcp_reass.c (revision 162084) @@ -1,3305 +1,3310 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef FAST_IPSEC #include #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #include #include #endif /*IPSEC*/ #include static const int tcprexmtthresh = 3; struct tcpstat tcpstat; SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); #ifdef TCP_DROP_SYNFIN static int drop_synfin = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif static int tcp_do_rfc3042 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); static int tcp_do_rfc3390 = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); static int tcp_insecure_rst = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, &tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets."); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); int tcp_reass_qsize = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_maxqlen = 48; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, &tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); static int tcp_reass_overflows = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static int tcp_timewait(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * - there is no delayed ack timer in progress and * - our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window and * - delayed acks are enabled or * - this is a half-synchronized T/TCP connection. */ #define DELAY_ACK(tp) \ ((!callout_active(tp->tt_delack) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { tcp_reass_maxseg = nmbclusters / 16; uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; void tcp_reass_init() { tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } static int tcp_reass(tp, th, tlenp, m) register struct tcpcb *tp; register struct tcphdr *th; int *tlenp; struct mbuf *m; { struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; int flags; INP_LOCK_ASSERT(tp->t_inpcb); /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). While * doing that it should move to its own file tcp_reass.c. */ /* * Call with th==NULL after become established to * force pre-ESTABLISHED data up to user socket. */ if (th == NULL) goto present; /* * Limit the number of segments in the reassembly queue to prevent * holding on to too many segments (and thus running out of mbufs). * Make sure to let the missing segment through which caused this * queue. Always keep one global queue entry spare to be able to * process the missing segment. */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || tp->t_segqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } /* * Allocate a new queue entry. If we can't, or hit the zone limit * just drop the pkt. */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; tcp_reass_qsize++; /* * Find a segment which begins after this one does. */ LIST_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; } /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ if (p != NULL) { register int i; /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS * completes. */ goto present; /* ??? */ } m_adj(m, i); *tlenp -= i; th->th_seq += i; } } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ while (q) { register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) break; if (i < q->tqe_len) { q->tqe_th->th_seq += i; q->tqe_len -= i; m_adj(q->tqe_m, i); break; } nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } /* Insert the new segment queue entry into place. */ te->tqe_m = m; te->tqe_th = th; te->tqe_len = *tlenp; if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { LIST_INSERT_AFTER(p, te, tqe_q); } present: /* * Present data to user, advancing rcv_nxt through * completed sequence space. */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); q = LIST_FIRST(&tp->t_segq); if (!q || q->tqe_th->th_seq != tp->rcv_nxt) return (0); SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); sorwakeup_locked(so); return (flags); } /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ #ifdef INET6 int tcp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { register struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return IPPROTO_DONE; } tcp_input(m, *offp); return IPPROTO_DONE; } #endif void tcp_input(m, off0) register struct mbuf *m; int off0; { register struct tcphdr *th; register struct ip *ip = NULL; register struct ipovly *ipov; register struct inpcb *inp = NULL; u_char *optp = NULL; int optlen = 0; int len, tlen, off; int drop_hdrlen; register struct tcpcb *tp = 0; register int thflags; struct socket *so = 0; int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ int headlocked = 0; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif int rstreason; /* For badport_bandlim accounting purposes */ struct ip6_hdr *ip6 = NULL; #ifdef INET6 int isipv6; #else const int isipv6 = 0; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[40]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } #else th = NULL; /* XXX: avoid compiler warning */ #endif } else { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { tcpstat.tcps_rcvshort++; return; } } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + ip->ip_len + IPPROTO_TCP)); th->th_sum ^= 0xffff; #ifdef TCPDEBUG ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); #endif } else { /* * Checksum extended TCP header and data. */ len = sizeof (struct ip) + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = (u_short)tlen; ipov->ih_len = htons(ipov->ih_len); th->th_sum = in_cksum(m, len); } if (th->th_sum) { tcpstat.tcps_rcvbadsum++; goto drop; } #ifdef INET6 /* Re-initialization for later version check */ ip->ip_v = IPVERSION; #endif } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); #endif } else { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; #ifdef TCP_DROP_SYNFIN /* * If the drop_synfin option is enabled, drop all packets with * both the SYN and FIN bits set. This prevents e.g. nmap from * identifying the TCP/IP stack. * * This is a violation of the TCP specification. */ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) goto drop; #endif /* * Convert TCP protocol specific fields to host format. */ th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, * until after ip6_savecontrol() is called and before other functions * which don't want those proto headers. * Because ip6_savecontrol() is going to parse the mbuf to * search for data to be passed up to user-land, it wants mbuf * parameters to be unchanged. * XXX: the call of ip6_savecontrol() has been obsoleted based on * latest version of the advanced API (20020110). */ drop_hdrlen = off0 + off; /* * Locate pcb for segment. */ INP_INFO_WLOCK(&tcbinfo); headlocked = 1; findpcb: KASSERT(headlocked, ("tcp_input: findpcb: head not locked")); #ifdef IPFIREWALL_FORWARD /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); } else { #endif /* IPFIREWALL_FORWARD */ if (isipv6) { #ifdef INET6 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #ifdef IPFIREWALL_FORWARD } #endif /* IPFIREWALL_FORWARD */ #if defined(IPSEC) || defined(FAST_IPSEC) #ifdef INET6 if (isipv6) { if (inp != NULL && ipsec6_in_reject(m, inp)) { #ifdef IPSEC ipsec6stat.in_polvio++; #endif goto drop; } } else #endif /* INET6 */ if (inp != NULL && ipsec4_in_reject(m, inp)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif goto drop; } #endif /*IPSEC || FAST_IPSEC*/ /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. */ if (inp == NULL) { if (log_in_vain) { #ifdef INET6 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; #else char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; #endif if (isipv6) { #ifdef INET6 strcpy(dbuf, "["); strcpy(sbuf, "["); strcat(dbuf, ip6_sprintf(&ip6->ip6_dst)); strcat(sbuf, ip6_sprintf(&ip6->ip6_src)); strcat(dbuf, "]"); strcat(sbuf, "]"); #endif } else { strcpy(dbuf, inet_ntoa(ip->ip_dst)); strcpy(sbuf, inet_ntoa(ip->ip_src)); } switch (log_in_vain) { case 1: if ((thflags & TH_SYN) == 0) break; /* FALLTHROUGH */ case 2: log(LOG_INFO, "Connection attempt to TCP %s:%d " "from %s:%d flags:0x%02x\n", dbuf, ntohs(th->th_dport), sbuf, ntohs(th->th_sport), thflags); break; default: break; } } if (blackhole) { switch (blackhole) { case 1: if (thflags & TH_SYN) goto drop; break; case 2: goto drop; default: goto drop; } } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK(inp); /* Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto drop; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto drop; } if (inp->inp_vflag & INP_TIMEWAIT) { /* * The only option of relevance is TOF_CC, and only if * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); if (tcp_timewait(inp, &to, th, m, tlen)) goto findpcb; /* * tcp_timewait unlocks inp. */ INP_INFO_WUNLOCK(&tcbinfo); return; } tp = intotcpcb(inp); if (tp == 0) { INP_UNLOCK(inp); rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if (tp->t_state == TCPS_CLOSED) goto drop; #ifdef MAC INP_LOCK_ASSERT(inp); if (mac_check_inpcb_deliver(inp, m)) goto drop; #endif so = inp->inp_socket; KASSERT(so != NULL, ("tcp_input: so == NULL")); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; if (isipv6) bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); else bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 inc.inc_isipv6 = isipv6; #endif if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; /* * If the state is LISTEN then ignore segment if it contains * a RST. If the segment contains an ACK then it is bad and * send a RST. If it does not contain a SYN then it is not * interesting; drop it. * * If the state is SYN_RECEIVED (syncache) and seg contains * an ACK, but not for our SYN/ACK, send a RST. If the seg * contains a RST, check the sequence number to see if it * is a valid reset segment. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { if (!syncache_expand(&inc, th, &so, m)) { /* * No syncache entry, or ACK was not * for our SYN/ACK. Send a RST. */ tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * Could not complete 3-way handshake, * connection is being closed down, and * syncache has free'd mbuf. */ INP_UNLOCK(inp); INP_INFO_WUNLOCK(&tcbinfo); return; } /* * Socket is created in state SYN_RECEIVED. * Continue processing segment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); /* * This is what would have happened in * tcp_output() when the SYN,ACK was sent. */ tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; goto after_listen; } if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto drop; } if (thflags & TH_ACK) { syncache_badack(&inc); tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } goto drop; } /* * Segment's flags are (SYN) or (SYN|FIN). */ #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { INP_UNLOCK(inp); tp = NULL; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* * If it is from this socket, drop it, it must be forged. * Don't bother responding if the destination was a broadcast. */ if (th->th_dport == th->th_sport) { if (isipv6) { if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) goto drop; } else { if (ip->ip_dst.s_addr == ip->ip_src.s_addr) goto drop; } } /* * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN * * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* * SYN appears to be valid; create compressed TCP state * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); if (!syncache_add(&inc, &to, th, inp, &so, m)) goto drop; /* XXX: does not happen */ if (so == NULL) { /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. Everything unlocked * already. */ return; } panic("T/TCP not supported at the moment"); #if 0 /* T/TCP */ /* * Segment passed TAO tests. * XXX: Can't happen at the moment. */ INP_UNLOCK(inp); inp = sotoinpcb(so); INP_LOCK(inp); tp = intotcpcb(inp); tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* * T/TCP logic: * If there is a FIN or if there is data, then * delay SYN,ACK(SYN) in the hope of piggy-backing * it on a response segment. Otherwise must send * ACK now in case the other side is slow starting. */ if (thflags & TH_FIN || tlen != 0) tp->t_flags |= (TF_DELACK | TF_NEEDSYN); else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tiwin = th->th_win << tp->snd_scale; tcpstat.tcps_connects++; soisconnected(so); goto trimthenstep6; #endif /* T/TCP */ } goto drop; } after_listen: KASSERT(headlocked, ("tcp_input: after_listen: head not locked")); INP_LOCK_ASSERT(inp); /* Syncache takes care of sockets in the listen state. */ KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN")); /* * This is the second part of the MSS DoS prevention code (after * minmss on the sending side) and it deals with too many too small * tcp packets in a too short timeframe (1 second). * * For every full second we count the number of received packets * and bytes. If we get a lot of packets per second for this connection * (tcp_minmssoverload) we take a closer look at it and compute the * average packet size for the past second. If that is less than * tcp_minmss we get too many packets with very small payload which * is not good and burdens our system (and every packet generates * a wakeup to the process connected to our socket). We can reasonable * expect this to be small packet DoS attack to exhaust our CPU * cycles. * * Care has to be taken for the minimum packet overload value. This * value defines the minimum number of packets per second before we * start to worry. This must not be too low to avoid killing for * example interactive connections with many small packets like * telnet or SSH. * * Setting either tcp_minmssoverload or tcp_minmss to "0" disables * this check. * * Account for packet if payload packet, skip over ACK, etc. */ if (tcp_minmss && tcp_minmssoverload && tp->t_state == TCPS_ESTABLISHED && tlen > 0) { if ((unsigned int)(tp->rcv_second - ticks) < hz) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_pps > tcp_minmssoverload) { if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) { printf("too many small tcp packets from " "%s:%u, av. %lubyte/packet, " "dropping connection\n", #ifdef INET6 isipv6 ? ip6_sprintf(&inp->inp_inc.inc6_faddr) : #endif inet_ntoa(inp->inp_inc.inc_faddr), inp->inp_inc.inc_fport, tp->rcv_byps / tp->rcv_pps); KASSERT(headlocked, ("tcp_input: " "after_listen: tcp_drop: head " "not locked")); tp = tcp_drop(tp, ECONNRESET); tcpstat.tcps_minmssdrops++; goto drop; } } } else { tp->rcv_second = ticks + hz; tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } } /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Unscale the window into a 32-bit value. * This value is bogus for the TCPS_SYN_SENT state * and is overwritten later. */ tiwin = th->th_win << tp->snd_scale; /* * Parse options on any incoming segment. */ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0) && TSTMP_GT(to.to_tsecr, ticks)) to.to_tsecr = 0; /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_requested_s_scale; tp->snd_wnd = th->th_win << tp->snd_scale; tiwin = tp->snd_wnd; } if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = ticks; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; else tp->t_flags |= TF_SACK_PERMIT; } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED above, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && to.to_nsacks == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure ack for outstanding data. */ ++tcpstat.tcps_predack; /* * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; /* * pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif */ if (tp->snd_una == tp->snd_max) callout_stop(tp->tt_rexmt); else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= sbspace(&so->so_rcv)) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* some progress has been done */ /* #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif * Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } sorwakeup_locked(so); if (DELAY_ACK(tp)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if (thflags & TH_RST) { if (thflags & TH_ACK) { KASSERT(headlocked, ("tcp_input: after_listen" ": tcp_drop.2: head not locked")); tp = tcp_drop(tp, ECONNREFUSED); } goto drop; } if ((thflags & TH_SYN) == 0) goto drop; /* Initial send window, already scaled. */ tp->snd_wnd = th->th_win; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_mbuf(m, so); SOCK_UNLOCK(so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp) && tlen != 0) callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); else tp->t_flags |= TF_ACKNOW; /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. If segment contains CC option * and there is a cached CC, apply TAO test. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); callout_stop(tp->tt_rexmt); tp->t_state = TCPS_SYN_RECEIVED; } #if 0 /* T/TCP */ trimthenstep6: #endif KASSERT(headlocked, ("tcp_input: trimthenstep6: head not " "locked")); INP_LOCK_ASSERT(inp); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; tcpstat.tcps_rcvpackafterwin++; tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. * * * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * Note: this does not take into account delayed ACKs, so * we should test against last_ack_sent instead of rcv_nxt. * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. * Note 2: Paul Watson's paper "Slipping in the Window" has shown * that brute force RST attacks are possible. To combat this, * we use a much stricter check while in the ESTABLISHED state, * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. * In any case, it never made sense to trim reset segments to * fit the receive window since RFC 1122 says: * 4.2.2.12 RST Segment: RFC-793 Section 3.4 * * A TCP SHOULD allow a received RST segment to include data. * * DISCUSSION * It has been suggested that a RST segment could contain * ASCII text that encoded and explained the cause of the * RST. No standard has yet been established for such * data. * * If the reset segment passes the sequence number test examine * the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK STATES: * Close the tcb. * TIME_WAIT STATE: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) { if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: if (tp->last_ack_sent != th->th_seq && tcp_insecure_rst == 0) { tcpstat.tcps_badrst++; goto drop; } case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; KASSERT(headlocked, ("tcp_input: " "trimthenstep6: tcp_close: head not " "locked")); tp = tcp_close(tp); break; case TCPS_CLOSING: case TCPS_LAST_ACK: KASSERT(headlocked, ("trimthenstep6: " "tcp_close.2: head not locked")); tp = tcp_close(tp); break; case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); break; } } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; } else { tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(headlocked, ("trimthenstep6: tcp_close.3: head not " "locked")); tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (thflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(th->th_seq, tp->rcv_nxt)) { KASSERT(headlocked, ("trimthenstep6: " "tcp_close.4: head not locked")); tp = tcp_close(tp); goto findpcb; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { KASSERT(headlocked, ("tcp_input: tcp_drop: trimthenstep6: " "head not locked")); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; goto drop; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if (tp->sack_enable && (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (!callout_active(tp->tt_rexmt) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { if (tp->sack_enable && IN_FASTRECOVERY(tp)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { tp->t_dupacks = 0; break; } } else if (tcp_do_newreno) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; if (tp->sack_enable) { tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big")); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("sent too much")); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->sack_enable) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else { /* * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. * But in case we would be inclined to * send a burst, better to do it via * the slow start mechanism. */ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - th->th_ack + tp->t_maxseg; else tp->snd_cwnd = tp->snd_ssthresh; } } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: KASSERT(headlocked, ("tcp_input: process_ACK: head not " "locked")); INP_LOCK_ASSERT(inp); acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ } /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) tp->t_rttlow = ticks - to.to_tsecr; tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { callout_stop(tp->tt_rexmt); needoutput = 1; } else if (!callout_active(tp->tt_persist)) callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ if ((!tcp_do_newreno && !tp->sack_enable) || !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<snd_scale); } SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } sowwakeup_locked(so); /* detect una wraparound */ if ((tcp_do_newreno || tp->sack_enable) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; if ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ /* XXXjl * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); callout_reset(tp->tt_2msl, tcp_maxidle, tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { KASSERT(headlocked, ("tcp_input: process_ACK: " "head not locked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { KASSERT(headlocked, ("tcp_input: process_ACK:" " tcp_close: head not locked")); tp = tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); goto dropafterack; } } step6: KASSERT(headlocked, ("tcp_input: step6: head not locked")); INP_LOCK_ASSERT(inp); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ KASSERT(headlocked, ("tcp_input: dodata: head not locked")); INP_LOCK_ASSERT(inp); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); sorwakeup_locked(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && tp->sack_enable) tcp_update_sack_list(tp, save_start, save_end); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. */ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /*FALLTHROUGH*/ case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: KASSERT(headlocked == 1, ("tcp_input: dodata: " "TCP_FIN_WAIT_2: head not locked")); tcp_twstart(tp); INP_INFO_WUNLOCK(&tcbinfo); return; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); callout_reset(tp->tt_2msl, 2 * tcp_msl, tcp_timer_2msl, tp); break; } } INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(headlocked == 0, ("tcp_input: check_delack: head locked")); INP_LOCK_ASSERT(inp); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; callout_reset(tp->tt_delack, tcp_delacktime, tcp_timer_delack, tp); } INP_UNLOCK(inp); return; dropafterack: KASSERT(headlocked, ("tcp_input: dropafterack: head not locked")); /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif KASSERT(headlocked, ("headlocked should be 1")); INP_INFO_WUNLOCK(&tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(inp); m_freem(m); return; dropwithreset: KASSERT(headlocked, ("tcp_input: dropwithreset: head not locked")); /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } /* IPv6 anycast check is done at tcp6_input() */ /* * Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; #ifdef TCPDEBUG if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } if (tp != NULL) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); m_freem(m); return; } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(to, cp, cnt, flags) struct tcpopt *to; u_char *cp; int cnt; int flags; { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!tcp_do_sack) continue; to->to_flags |= TOF_SACK; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; tcpstat.tcps_sack_rcv_blocks++; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(so, th, m, off) struct socket *so; struct tcphdr *th; register struct mbuf *m; int off; /* delayed to be droped hdrlen */ { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == 0) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(tp, rtt) register struct tcpcb *tp; int rtt; { register int delta; INP_LOCK_ASSERT(tp->t_inpcb); tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing * interface without forcing IP to fragment; if bigger than * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES * to utilize large mbufs. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. * * NOTE that this routine is only called when we process an incoming * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { int rtt, mss; u_long bufsize; u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; struct hc_metrics_lite metrics; int origoffer = offer; + int mtuflags = 0; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif /* initialize */ #ifdef INET6 if (isipv6) { - maxmtu = tcp_maxmtu6(&inp->inp_inc); + maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; } else #endif { - maxmtu = tcp_maxmtu(&inp->inp_inc); + maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } so = inp->inp_socket; /* * no route to sender, stay with default mss and return */ if (maxmtu == 0) return; /* what have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt. */ offer = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif tcp_mssdflt; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ offer = max(offer, 64); } /* * rmx information is now retrieved from tcp_hostcache */ tcp_hc_get(&inp->inp_inc, &metrics); /* * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; if (!path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); } } mss = min(mss, offer); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates, that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif tp->t_maxseg = mss; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* * While we're here, check the others too */ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. * * Extend this so we cache the cwnd too and retrieve it here. * Make cwnd even bigger than RFC3390 suggests but only if we * have previous experience with the remote host. Be careful * not make cwnd bigger than remote receive window or our own * send socket buffer. Maybe put some additional upper bound * on the retrieved cwnd. Should do incremental updates to * hostcache when cwnd collapses so next connection doesn't * overloads the path again. * * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. * We currently check only in syncache_socket for that. */ #define TCP_METRICS_CWND #ifdef TCP_METRICS_CWND if (metrics.rmx_cwnd) tp->snd_cwnd = max(mss, min(metrics.rmx_cwnd / 2, min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) #else else if (in_localaddr(inp->inp_faddr)) #endif tp->snd_cwnd = mss * ss_fltsz_local; else tp->snd_cwnd = mss * ss_fltsz; + + /* Check the interface for TSO capabilities. */ + if (mtuflags & CSUM_TSO) + tp->t_flags |= TF_TSO; } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(inc) struct in_conninfo *inc; { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; #ifdef INET6 int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (isipv6) { mss = tcp_v6mssdflt; - maxmtu = tcp_maxmtu6(inc); + maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { mss = tcp_mssdflt; - maxmtu = tcp_maxmtu(inc); + maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(tp, th) struct tcpcb *tp; struct tcphdr *th; { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > th->th_ack - tp->snd_una) tp->snd_cwnd -= th->th_ack - tp->snd_una; else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ static int tcp_timewait(inp, to, th, m, tlen) struct inpcb *inp; struct tcpopt *to; struct tcphdr *th; struct mbuf *m; int tlen; { struct tcptw *tw; int thflags; tcp_seq seq; #ifdef INET6 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #else const int isipv6 = 0; #endif /* tcbinfo lock required for tcp_twclose(), tcp_2msl_reset. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_timer_2msl_reset(tw, 2 * tcp_msl, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); goto drop; /* * Generate a RST, dropping incoming segment. * Make ACK acceptable to originator of segment. * Don't bother to respond if destination was broadcast/multicast. */ if (m->m_flags & (M_BCAST|M_MCAST)) goto drop; if (isipv6) { struct ip6_hdr *ip6; /* IPv6 anycast check is done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; } else { struct ip *ip; ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } if (thflags & TH_ACK) { tcp_respond(NULL, mtod(m, void *), th, m, 0, th->th_ack, TH_RST); } else { seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); tcp_respond(NULL, mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); } INP_UNLOCK(inp); return (0); drop: INP_UNLOCK(inp); m_freem(m); return (0); } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 162083) +++ head/sys/netinet/tcp_subr.c (revision 162084) @@ -1,2383 +1,2397 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #ifdef FAST_IPSEC #include #include #ifdef INET6 #include #endif #include #define IPSEC #endif /*FAST_IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); /* * Number of TCP segments per second we accept from remote host * before we start to calculate average segment size. If average * segment size drops below the minimum TCP MSS we assume a DoS * attack and reset+drop the connection. Care has to be taken not to * set this value too small to not kill interactive type connections * (telnet, SSH) which send many small packets. */ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" "be under the MINMSS Size"); #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); static uma_zone_t tcptw_zone; static int maxtcptw; static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = maxsockets / 5; else new = maxtcptw; error = sysctl_handle_int(oidp, &new, sizeof(int), req); if (error == 0 && req->newptr) { if (new > maxtcptw) { maxtcptw = new; uma_zone_set_max(tcptw_zone, maxtcptw); } else error = EINVAL; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); static int tcp_inflight_enable = 1; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_rttthresh; SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", "RTT threshold below which inflight will deactivate itself"); static int tcp_inflight_min = 6144; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; struct callout tcpcb_mem_2msl, tcpcb_mem_delack; }; static uma_zone_t tcpcb_zone; struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, maxsockets / 5); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = (struct inpcb *) mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR /* * These have to be type stable for the benefit of the timers. */ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcpcb_zone, maxsockets); tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, maxsockets / 5); else uma_zone_set_max(tcptw_zone, maxtcptw); tcp_timer_init(); syncache_init(); tcp_hc_init(); tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void tcp_fini(void *xtp) { callout_stop(&isn_callout); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_LOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else #endif { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, register struct tcphdr *th, register struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { register int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; if (path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_create_mbuf_from_inpcb(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_reflect_mbuf_tcp(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); else #endif /* INET6 */ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, NET_CALLOUT_MPSAFE); callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, NET_CALLOUT_MPSAFE); callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, NET_CALLOUT_MPSAFE); callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, NET_CALLOUT_MPSAFE); callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, NET_CALLOUT_MPSAFE); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); tp->sack_enable = tcp_do_sack; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ INP_LOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; } tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); in_pcbdrop(inp); tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) inp_list[i++] = inp; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, (tcp_minmss) + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do right thing by itself. */ - if (mtu <= tcp_maxmtu(&inc)) + if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_UNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&tcbinfo); } else in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; tcp_seq tcp_new_isn(struct tcpcb *tp) { u_int32_t md5_buffer[4]; tcp_seq new_isn; INP_LOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / hz boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { u_int32_t projected_offset; ISN_LOCK(); projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (projected_offset > isn_offset) isn_offset = projected_offset; isn_offset_old = isn_offset; callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp; struct socket *so = inp->inp_socket; u_int maxmtu; u_int romtu; int mss; #ifdef INET6 int isipv6; #endif /* INET6 */ INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ romtu = #ifdef INET6 - isipv6 ? tcp_maxmtu6(&inp->inp_inc) : + isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) : #endif /* INET6 */ - tcp_maxmtu(&inp->inp_inc); + tcp_maxmtu(&inp->inp_inc, NULL); if (!maxmtu) maxmtu = romtu; else maxmtu = min(maxmtu, romtu); if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return (inp); } mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return (inp); tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->sack_enable) EXIT_FASTRECOVERY(tp); tcp_output(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. */ u_long -tcp_maxmtu(struct in_conninfo *inc) +tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; rtalloc_ign(&sro, RTF_CLONING); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO4 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long -tcp_maxmtu6(struct in_conninfo *inc) +tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO6 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /*IPSEC*/ /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { struct tcptw *tw; struct inpcb *inp; int tw_time, acknow; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_reset(). */ INP_LOCK_ASSERT(tp->t_inpcb); tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_timer_2msl_tw(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_UNLOCK(tp->t_inpcb); return; } } inp = tp->t_inpcb; tw->tw_inpcb = inp; /* * Recover last window size sent. */ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) tw->t_recent = tp->ts_recent; else tw->t_recent = 0; tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ tw_time = 2 * tcp_msl; acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); SOCK_LOCK(so); tw->tw_cred = crhold(so->so_cred); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_vflag |= INP_TIMEWAIT; tcp_timer_2msl_reset(tw, tw_time, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. */ if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_UNLOCK(inp); } #if 0 /* * The appromixate rate of ISN increase of Microsoft TCP stacks; * the actual rate is slightly higher due to the addition of * random positive increments. * * Most other new OSes use semi-randomized ISN values, so we * do not need to worry about them. */ #define MS_ISN_BYTES_PER_SECOND 250000 /* * Determine if the ISN we will generate has advanced beyond the last * sequence number used by the previous connection. If so, indicate * that it is safe to recycle this tw socket by returning 1. */ int tcp_twrecycleable(struct tcptw *tw) { tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; INP_INFO_WLOCK_ASSERT(&tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) return (1); else return (0); } #endif void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_stop(). */ INP_LOCK_ASSERT(inp); tw->tw_inpcb = NULL; tcp_timer_2msl_stop(tw); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_vflag & INP_SOCKREF) { inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_UNLOCK(inp); } } else { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) in6_pcbfree(inp); else #endif in_pcbfree(inp); } tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) return; uma_zfree(tcptw_zone, tw); } int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; struct ip *ip = NULL; u_int8_t *optp; u_int hdrlen, optlen; int error; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_isipv6; #endif INP_LOCK_ASSERT(inp); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } optp = (u_int8_t *)(th + 1); /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. */ if (tw->t_recent && flags == TH_ACK) { u_int32_t *lp = (u_int32_t *)optp; /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tw->t_recent); optp += TCPOLEN_TSTAMP_APPA; } optlen = optp - (u_int8_t *)(th + 1); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); #ifdef INET6 if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) tcpstat.tcps_sndacks++; else tcpstat.tcps_sndctrl++; tcpstat.tcps_sndtotal++; return (error); } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only effects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typicaly 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; INP_LOCK_ASSERT(tp->t_inpcb); /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rollover 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case and stabilization. * Without the slop we could be locking ourselves into a lower * bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). * * (4) Stability value (defaults to 20 = 2 maximal packets) can * be adjusted with a sysctl but typically only needs to be * on very slow connections. A value no smaller then 5 * should be used, but only reduce this default if you have * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * off0 offset to TCP header within the mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * This function is for IPv4 use only. Calling this function with an * IPv6 packet in the mbuf chain will yield undefined results. * * Return 0 if successful, otherwise return -1. * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); bzero(&dst, sizeof(union sockaddr_union)); dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { printf("%s: SADB lookup failed for %s\n", __func__, inet_ntoa(dst.sin.sin_addr)); return (EINVAL); } MD5Init(&ctx); ipovly = (struct ipovly *)ip; th = (struct tcphdr *)((u_char *)ip + off0); doff = off0 + sizeof(struct tcphdr) + optlen; /* * Step 1: Update MD5 hash with IP pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { INP_LOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tcp_drop(tp, ECONNABORTED); } INP_UNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c (revision 162083) +++ head/sys/netinet/tcp_timewait.c (revision 162084) @@ -1,2383 +1,2397 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_sack.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #include #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #ifdef FAST_IPSEC #include #include #ifdef INET6 #include #endif #include #define IPSEC #endif /*FAST_IPSEC*/ #include #include int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #ifdef INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_RW, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ int tcp_minmss = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); /* * Number of TCP segments per second we accept from remote host * before we start to calculate average segment size. If average * segment size drops below the minimum TCP MSS we assume a DoS * attack and reset+drop the connection. Care has to be taken not to * set this value too small to not kill interactive type connections * (telnet, SSH) which send many small packets. */ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" "be under the MINMSS Size"); #if 0 static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); #endif int tcp_do_rfc1323 = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_isn_reseed_interval = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); static uma_zone_t tcptw_zone; static int maxtcptw; static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = maxsockets / 5; else new = maxtcptw; error = sysctl_handle_int(oidp, &new, sizeof(int), req); if (error == 0 && req->newptr) { if (new > maxtcptw) { maxtcptw = new; uma_zone_set_max(tcptw_zone, maxtcptw); } else error = EINVAL; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of * 1024 exists only for debugging. A good production default would be * something like 6100. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); static int tcp_inflight_enable = 1; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); static int tcp_inflight_rttthresh; SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", "RTT threshold below which inflight will deactivate itself"); static int tcp_inflight_min = 6144; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_isn_tick(void *); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 512 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; struct callout tcpcb_mem_2msl, tcpcb_mem_delack; }; static uma_zone_t tcpcb_zone; struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, maxsockets / 5); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = (struct inpcb *) mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } void tcp_init(void) { int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); LIST_INIT(&tcb); tcbinfo.listhead = &tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR /* * These have to be type stable for the benefit of the timers. */ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcpcb_zone, maxsockets); tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(tcptw_zone, maxsockets / 5); else uma_zone_set_max(tcptw_zone, maxtcptw); tcp_timer_init(); syncache_init(); tcp_hc_init(); tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void tcp_fini(void *xtp) { callout_stop(&isn_callout); } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_LOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else #endif { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct mbuf *m; struct tcptemp *n; m = m_get(M_DONTWAIT, MT_DATA); if (m == NULL) return (0); m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the * segment ti, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, register struct tcphdr *th, register struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { register int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == 6; ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); } else #endif { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; ip->ip_ttl = ip_defttl; if (path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_create_mbuf_from_inpcb(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_reflect_mbuf_tcp(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef INET6 if (isipv6) { nth->th_sum = 0; nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); else #endif /* INET6 */ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; /* Set up our timeouts. */ callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, NET_CALLOUT_MPSAFE); callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, NET_CALLOUT_MPSAFE); callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, NET_CALLOUT_MPSAFE); callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, NET_CALLOUT_MPSAFE); callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, NET_CALLOUT_MPSAFE); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); tp->sack_enable = tcp_do_sack; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ INP_LOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we * delete the PCB. */ callout_stop(tp->tt_rexmt); callout_stop(tp->tt_persist); callout_stop(tp->tt_keep); callout_stop(tp->tt_2msl); callout_stop(tp->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; tcp_reass_qsize--; } tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); in_pcbdrop(inp); tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { if (do_tcpdrain) { struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * usefull. */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { while ((te = LIST_FIRST(&tcpb->t_segq)) != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); } INP_INFO_RUNLOCK(&tcbinfo); } } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_vflag & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) inp_list[i++] = inp; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error, mapped = 0; error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else return (EINVAL); } INP_INFO_RLOCK(&tcbinfo); if (mapped == 1) inp = in_pcblookup_hash(&tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; } INP_LOCK(inp); if (inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); outunlocked: INP_INFO_RUNLOCK(&tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&tcbinfo); inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (!(inp->inp_vflag & INP_TIMEWAIT) && !(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_flags = 0; /* IPv4 */ inc.inc_faddr = faddr; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. ip->ip_len has already * been swapped in icmp_input(). */ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); if (mtu < max(296, (tcp_minmss) + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) mtu = tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do right thing by itself. */ - if (mtu <= tcp_maxmtu(&inc)) + if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); } inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_UNLOCK(inp); } else { inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; #ifdef INET6 inc.inc_isipv6 = 0; #endif syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&tcbinfo); } else in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; INP_INFO_WLOCK(&tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&tcbinfo); } else in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static MD5_CTX isn_ctx; tcp_seq tcp_new_isn(struct tcpcb *tp) { u_int32_t md5_buffer[4]; tcp_seq new_isn; INP_LOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&isn_secret, sizeof(isn_secret)); isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); new_isn += isn_offset; ISN_UNLOCK(); return (new_isn); } /* * Increment the offset to the next ISN_BYTES_PER_SECOND / hz boundary * to keep time flowing at a relatively constant rate. If the random * increments have already pushed us past the projected offset, do nothing. */ static void tcp_isn_tick(void *xtp) { u_int32_t projected_offset; ISN_LOCK(); projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100; if (projected_offset > isn_offset) isn_offset = projected_offset; isn_offset_old = isn_offset; callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { struct tcpcb *tp; struct socket *so = inp->inp_socket; u_int maxmtu; u_int romtu; int mss; #ifdef INET6 int isipv6; #endif /* INET6 */ INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || (inp->inp_vflag & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ romtu = #ifdef INET6 - isipv6 ? tcp_maxmtu6(&inp->inp_inc) : + isipv6 ? tcp_maxmtu6(&inp->inp_inc, NULL) : #endif /* INET6 */ - tcp_maxmtu(&inp->inp_inc); + tcp_maxmtu(&inp->inp_inc, NULL); if (!maxmtu) maxmtu = romtu; else maxmtu = min(maxmtu, romtu); if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : #endif /* INET6 */ tcp_mssdflt; return (inp); } mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : #endif /* INET6 */ sizeof(struct tcpiphdr) #ifdef INET6 ) #endif /* INET6 */ ; /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return (inp); tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->sack_enable) EXIT_FASTRECOVERY(tp); tcp_output(tp); return (inp); } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. */ u_long -tcp_maxmtu(struct in_conninfo *inc) +tcp_maxmtu(struct in_conninfo *inc, int *flags) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; rtalloc_ign(&sro, RTF_CLONING); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO4 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro.ro_rt); } return (maxmtu); } #ifdef INET6 u_long -tcp_maxmtu6(struct in_conninfo *inc) +tcp_maxmtu6(struct in_conninfo *inc, int *flags) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; rtalloc_ign((struct route *)&sro6, RTF_CLONING); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO6 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); MGETHDR(m, M_DONTWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /*IPSEC*/ /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { struct tcptw *tw; struct inpcb *inp; int tw_time, acknow; struct socket *so; INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_reset(). */ INP_LOCK_ASSERT(tp->t_inpcb); tw = uma_zalloc(tcptw_zone, M_NOWAIT); if (tw == NULL) { tw = tcp_timer_2msl_tw(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_UNLOCK(tp->t_inpcb); return; } } inp = tp->t_inpcb; tw->tw_inpcb = inp; /* * Recover last window size sent. */ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) tw->t_recent = tp->ts_recent; else tw->t_recent = 0; tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ tw_time = 2 * tcp_msl; acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); SOCK_LOCK(so); tw->tw_cred = crhold(so->so_cred); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_vflag |= INP_TIMEWAIT; tcp_timer_2msl_reset(tw, tw_time, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. */ if (inp->inp_vflag & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_UNLOCK(inp); } #if 0 /* * The appromixate rate of ISN increase of Microsoft TCP stacks; * the actual rate is slightly higher due to the addition of * random positive increments. * * Most other new OSes use semi-randomized ISN values, so we * do not need to worry about them. */ #define MS_ISN_BYTES_PER_SECOND 250000 /* * Determine if the ISN we will generate has advanced beyond the last * sequence number used by the previous connection. If so, indicate * that it is safe to recycle this tw socket by returning 1. */ int tcp_twrecycleable(struct tcptw *tw) { tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; INP_INFO_WLOCK_ASSERT(&tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) return (1); else return (0); } #endif void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_stop(). */ INP_LOCK_ASSERT(inp); tw->tw_inpcb = NULL; tcp_timer_2msl_stop(tw); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_vflag & INP_SOCKREF) { inp->inp_vflag &= ~INP_SOCKREF; INP_UNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_UNLOCK(inp); } } else { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) in6_pcbfree(inp); else #endif in_pcbfree(inp); } tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) return; uma_zfree(tcptw_zone, tw); } int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; struct ip *ip = NULL; u_int8_t *optp; u_int hdrlen, optlen; int error; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_isipv6; #endif INP_LOCK_ASSERT(inp); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } optp = (u_int8_t *)(th + 1); /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. */ if (tw->t_recent && flags == TH_ACK) { u_int32_t *lp = (u_int32_t *)optp; /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tw->t_recent); optp += TCPOLEN_TSTAMP_APPA; } optlen = optp - (u_int8_t *)(th + 1); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); #ifdef INET6 if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif { th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) tcpstat.tcps_sndacks++; else tcpstat.tcps_sndctrl++; tcpstat.tcps_sndtotal++; return (error); } /* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a * means of determining the optimal window size to maximize bandwidth, * minimize RTT, and avoid the over-allocation of buffers on interfaces and * routers. This code also does a fairly good job keeping RTTs in check * across slow links like modems. We implement an algorithm which is very * similar (but not meant to be) TCP/Vegas. The code operates on the * transmitter side of a TCP connection and so only effects the transmit * side of the connection. * * BACKGROUND: TCP makes no provision for the management of buffer space * at the end points or at the intermediate routers and switches. A TCP * stream, whether using NewReno or not, will eventually buffer as * many packets as it is able and the only reason this typically works is * due to the fairly small default buffers made available for a connection * (typicaly 16K or 32K). As machines use larger windows and/or window * scaling it is now fairly easy for even a single TCP connection to blow-out * all available buffer space not only on the local interface, but on * intermediate routers and switches as well. NewReno makes a misguided * attempt to 'solve' this problem by waiting for an actual failure to occur, * then backing off, then steadily increasing the window again until another * failure occurs, ad-infinitum. This results in terrible oscillation that * is only made worse as network loads increase and the idea of intentionally * blowing out network buffers is, frankly, a terrible way to manage network * resources. * * It is far better to limit the transmit window prior to the failure * condition being achieved. There are two general ways to do this: First * you can 'scan' through different transmit window sizes and locate the * point where the RTT stops increasing, indicating that you have filled the * pipe, then scan backwards until you note that RTT stops decreasing, then * repeat ad-infinitum. This method works in principle but has severe * implementation issues due to RTT variances, timer granularity, and * instability in the algorithm which can lead to many false positives and * create oscillations as well as interact badly with other TCP streams * implementing the same algorithm. * * The second method is to limit the window to the bandwidth delay product * of the link. This is the method we implement. RTT variances and our * own manipulation of the congestion window, bwnd, can potentially * destabilize the algorithm. For this reason we have to stabilize the * elements used to calculate the window. We do this by using the minimum * observed RTT, the long term average of the observed bandwidth, and * by adding two segments worth of slop. It isn't perfect but it is able * to react to changing conditions and gives us a very stable basis on * which to extend the algorithm. */ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { u_long bw; u_long bwnd; int save_ticks; INP_LOCK_ASSERT(tp->t_inpcb); /* * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. */ if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; } /* * Figure out the bandwidth. Due to the tick granularity this * is a very rough number and it MUST be averaged over a fairly * long period of time. XXX we need to take into account a link * that is not using all available bandwidth, but for now our * slop will ramp us up if this case occurs and the bandwidth later * increases. * * Note: if ticks rollover 'bw' may wind up negative. We must * effectively reset t_bw_rtttime for this case. */ save_ticks = ticks; if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) return; bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / (save_ticks - tp->t_bw_rtttime); tp->t_bw_rtttime = save_ticks; tp->t_bw_rtseq = ack_seq; if (tp->t_bw_rtttime == 0 || (int)bw < 0) return; bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; tp->snd_bandwidth = bw; /* * Calculate the semi-static bandwidth delay product, plus two maximal * segments. The additional slop puts us squarely in the sweet * spot and also handles the bandwidth run-up case and stabilization. * Without the slop we could be locking ourselves into a lower * bandwidth. * * Situations Handled: * (1) Prevents over-queueing of packets on LANs, especially on * high speed LANs, allowing larger TCP buffers to be * specified, and also does a good job preventing * over-queueing of packets over choke points like modems * (at least for the transmit side). * * (2) Is able to handle changing network loads (bandwidth * drops so bwnd drops, bandwidth increases so bwnd * increases). * * (3) Theoretically should stabilize in the face of multiple * connections implementing the same algorithm (this may need * a little work). * * (4) Stability value (defaults to 20 = 2 maximal packets) can * be adjusted with a sysctl but typically only needs to be * on very slow connections. A value no smaller then 5 * should be used, but only reduce this default if you have * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { static int ltime; if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { ltime = ticks; printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", tp, bw, tp->t_rttbest, tp->t_srtt, bwnd ); } } if ((long)bwnd < tcp_inflight_min) bwnd = tcp_inflight_min; if (bwnd > tcp_inflight_max) bwnd = tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; } #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * off0 offset to TCP header within the mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * This function is for IPv4 use only. Calling this function with an * IPv6 packet in the mbuf chain will yield undefined results. * * Return 0 if successful, otherwise return -1. * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 * right now. Another branch of this code exists which uses the SPD to * specify per-application flows but it is unstable. */ int tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; struct ippseudo ippseudo; MD5_CTX ctx; int doff; struct ip *ip; struct ipovly *ipovly; struct secasvar *sav; struct tcphdr *th; u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); bzero(&dst, sizeof(union sockaddr_union)); dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { printf("%s: SADB lookup failed for %s\n", __func__, inet_ntoa(dst.sin.sin_addr)); return (EINVAL); } MD5Init(&ctx); ipovly = (struct ipovly *)ip; th = (struct tcphdr *)((u_char *)ip + off0); doff = off0 + sizeof(struct tcphdr) + optlen; /* * Step 1: Update MD5 hash with IP pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; struct in6_addr f6, l6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, ip6_use_defzone); if (error) return (error); break; #endif case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; default: return (EINVAL); } INP_INFO_WLOCK(&tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { INP_LOCK(inp); if (inp->inp_vflag & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); } else if (!(inp->inp_vflag & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tcp_drop(tp, ECONNABORTED); } INP_UNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 162083) +++ head/sys/netinet/tcp_var.h (revision 162084) @@ -1,571 +1,572 @@ /*- * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include /* * Kernel variables for tcp. */ extern int tcp_do_rfc1323; /* TCP segment queue entry */ struct tseg_qent { LIST_ENTRY(tseg_qent) tqe_q; int tqe_len; /* TCP segment data length */ struct tcphdr *tqe_th; /* a pointer to tcp header */ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; }; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. */ struct tcpcb { struct tsegqe_head t_segq; /* segment reassembly queue */ int t_segqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ struct callout *tt_rexmt; /* retransmit timer */ struct callout *tt_persist; /* retransmit persistence */ struct callout *tt_keep; /* keepalive */ struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout *tt_delack; /* delayed ACK timer */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ u_int t_flags; #define TF_ACKNOW 0x000001 /* ack peer immediately */ #define TF_DELACK 0x000002 /* ack, but try to delay it */ #define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x000008 /* don't use tcp options */ #define TF_SENTFIN 0x000010 /* have sent FIN */ #define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ +#define TF_TSO 0x1000000 /* TSO enabled on this connection */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_up; /* receive urgent pointer */ u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ u_long snd_bwnd; /* bandwidth-controlled window */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ u_long snd_bandwidth; /* calculated bandwidth or 0 */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ u_long t_rcvtime; /* inactivity time */ u_long t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ int t_bw_rtttime; /* used for bandwidth calculation */ tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ u_long t_rttupdated; /* number of times rtt sampled */ u_long max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ /* out-of-band data */ char t_oobflags; /* have some */ char t_iobc; /* input character */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* RFC 1323 variables */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ u_char requested_s_scale; /* unused, to be reused later */ u_long ts_recent; /* timestamp echo data */ u_long ts_recent_age; /* when last updated */ tcp_seq last_ack_sent; /* experimental */ u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ u_char snd_limited; /* segments limited transmitted */ /* anti DoS counters */ u_long rcv_second; /* start of interval second */ u_long rcv_pps; /* received packets per second */ u_long rcv_byps; /* received bytes per second */ /* SACK related state */ int sack_enable; /* enable SACK for this connection */ int snd_numholes; /* number of holes seen by sender */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ int rcv_numsacks; /* # distinct sack blks present */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY #ifdef TCP_SIGNATURE /* * Defines which are needed by the xform_tcp module and tcp_[in|out]put * for SADB verification and lookup. */ #define TCP_SIGLEN 16 /* length of computed digest in bytes */ #define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */ #define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */ /* * Only a single SA per host may be specified at this time. An SPI is * needed in order for the KEY_ALLOCSA() lookup to work. */ #define TCP_SIG_SPI 0x1000 #endif /* TCP_SIGNATURE */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions. */ struct tcpopt { u_long to_flags; /* which options are present */ #define TOF_TS 0x0001 /* timestamp */ #define TOF_MSS 0x0010 /* maximum segment size */ #define TOF_SCALE 0x0020 /* window scaling */ #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ #define TOF_SACK 0x0100 /* Peer sent SACK option */ u_int32_t to_tsval; u_int32_t to_tsecr; u_int16_t to_mss; u_int8_t to_requested_s_scale; u_int8_t to_nsacks; /* number of SACK blocks */ u_char *to_sacks; /* pointer to the first SACK blocks */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_bandwidth; /* estimated bandwidth */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; tcp_seq rcv_nxt; tcp_seq iss; tcp_seq irs; u_short last_win; /* cached window value */ u_short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_long t_recent; u_long t_starttime; int tw_time; LIST_ENTRY(tcptw) tw_2msl; }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { u_long tcps_connattempt; /* connections initiated */ u_long tcps_accepts; /* connections accepted */ u_long tcps_connects; /* connections established */ u_long tcps_drops; /* connections dropped */ u_long tcps_conndrops; /* embryonic connections dropped */ u_long tcps_minmssdrops; /* average minmss too low drops */ u_long tcps_closed; /* conn. closed (includes drops) */ u_long tcps_segstimed; /* segs where we tried to get rtt */ u_long tcps_rttupdated; /* times we succeeded */ u_long tcps_delack; /* delayed acks sent */ u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ u_long tcps_rexmttimeo; /* retransmit timeouts */ u_long tcps_persisttimeo; /* persist timeouts */ u_long tcps_keeptimeo; /* keepalive timeouts */ u_long tcps_keepprobe; /* keepalive probes sent */ u_long tcps_keepdrops; /* connections dropped in keepalive */ u_long tcps_sndtotal; /* total packets sent */ u_long tcps_sndpack; /* data packets sent */ u_long tcps_sndbyte; /* data bytes sent */ u_long tcps_sndrexmitpack; /* data packets retransmitted */ u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */ u_long tcps_sndacks; /* ack-only packets sent */ u_long tcps_sndprobe; /* window probes sent */ u_long tcps_sndurg; /* packets sent with URG only */ u_long tcps_sndwinup; /* window update-only packets sent */ u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ u_long tcps_rcvtotal; /* total packets received */ u_long tcps_rcvpack; /* packets received in sequence */ u_long tcps_rcvbyte; /* bytes received in sequence */ u_long tcps_rcvbadsum; /* packets received with ccksum errs */ u_long tcps_rcvbadoff; /* packets received with bad offset */ u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ u_long tcps_rcvshort; /* packets received too short */ u_long tcps_rcvduppack; /* duplicate-only packets received */ u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ u_long tcps_rcvpartduppack; /* packets with some duplicate data */ u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ u_long tcps_rcvoopack; /* out-of-order packets received */ u_long tcps_rcvoobyte; /* out-of-order bytes received */ u_long tcps_rcvpackafterwin; /* packets with data after window */ u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ u_long tcps_rcvafterclose; /* packets rcvd after "close" */ u_long tcps_rcvwinprobe; /* rcvd window probe packets */ u_long tcps_rcvdupack; /* rcvd duplicate acks */ u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ u_long tcps_rcvackpack; /* rcvd ack packets */ u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ u_long tcps_rcvwinupd; /* rcvd window update packets */ u_long tcps_pawsdrop; /* segments dropped due to PAWS */ u_long tcps_predack; /* times hdr predict ok for acks */ u_long tcps_preddat; /* times hdr predict ok for data pkts */ u_long tcps_pcbcachemiss; u_long tcps_cachedrtt; /* times cached RTT in route updated */ u_long tcps_cachedrttvar; /* times cached rttvar updated */ u_long tcps_cachedssthresh; /* times cached ssthresh updated */ u_long tcps_usedrtt; /* times RTT initialized from route */ u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ u_long tcps_persistdrop; /* timeout in persist state */ u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ u_long tcps_mturesent; /* resends due to MTU discovery */ u_long tcps_listendrop; /* listen queue overflows */ u_long tcps_badrst; /* ignored RSTs in the window */ u_long tcps_sc_added; /* entry added to syncache */ u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */ u_long tcps_sc_dupsyn; /* duplicate SYN packet */ u_long tcps_sc_dropped; /* could not reply to packet */ u_long tcps_sc_completed; /* successful extraction of entry */ u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */ u_long tcps_sc_reset; /* RST removed entry from syncache */ u_long tcps_sc_stale; /* timed out or listen socket gone */ u_long tcps_sc_aborted; /* syncache entry aborted */ u_long tcps_sc_badack; /* removed due to bad ACK */ u_long tcps_sc_unreach; /* ICMP unreachable received */ u_long tcps_sc_zonefail; /* zalloc() failed */ u_long tcps_sc_sendcookie; /* SYN cookie sent */ u_long tcps_sc_recvcookie; /* SYN cookie received */ u_long tcps_hc_added; /* entry added to hostcache */ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ /* SACK related stats */ u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ u_long tcps_sack_rexmits; /* SACK rexmit segments */ u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ }; /* * TCB structure exported to user-land via sysctl(3). * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { size_t xt_len; struct inpcb xt_inp; struct tcpcb xt_tp; struct xsocket xt_socket; u_quad_t xt_alignment_hack; }; #endif /* * Names for TCP sysctl objects */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics (read-only) */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_MAXID 16 #define TCPCTL_NAMES { \ { 0, 0 }, \ { "rfc1323", CTLTYPE_INT }, \ { "mssdflt", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ { "rttdflt", CTLTYPE_INT }, \ { "keepidle", CTLTYPE_INT }, \ { "keepintvl", CTLTYPE_INT }, \ { "sendspace", CTLTYPE_INT }, \ { "recvspace", CTLTYPE_INT }, \ { "keepinit", CTLTYPE_INT }, \ { "pcblist", CTLTYPE_STRUCT }, \ { "delacktime", CTLTYPE_INT }, \ { "v6mssdflt", CTLTYPE_INT }, \ { "maxid", CTLTYPE_INT }, \ } #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); #endif extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_minmssoverload; extern int tcp_delack_enabled; extern int tcp_do_newreno; extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; extern int tcp_do_sack; /* SACK enabled/disabled */ struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); #if 0 int tcp_twrecycleable(struct tcptw *tw); #endif void tcp_twclose(struct tcptw *_tw, int _reuse); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_fasttimo(void); void tcp_init(void); void tcp_fini(void *); void tcp_reass_init(void); void tcp_input(struct mbuf *, int); -u_long tcp_maxmtu(struct in_conninfo *); -u_long tcp_maxmtu6(struct in_conninfo *); +u_long tcp_maxmtu(struct in_conninfo *, int *); +u_long tcp_maxmtu6(struct in_conninfo *, int *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct inpcb * tcp_mtudisc(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); int tcp_twrespond(struct tcptw *, int); void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); #endif void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); struct tcpcb * tcp_timers(struct tcpcb *, int); void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int); void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); void syncache_init(void); void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcphdr *, struct socket **, struct mbuf *); int syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace; extern u_long tcp_recvspace; tcp_seq tcp_new_isn(struct tcpcb *); void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: head/sys/netinet6/icmp6.c =================================================================== --- head/sys/netinet6/icmp6.c (revision 162083) +++ head/sys/netinet6/icmp6.c (revision 162084) @@ -1,2785 +1,2785 @@ /* $FreeBSD$ */ /* $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif #ifdef FAST_IPSEC #include #include #endif extern struct domain inet6domain; struct icmp6stat icmp6stat; extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; extern int icmp6errppslim; static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; static void icmp6_errcount __P((struct icmp6errstat *, int, int)); static int icmp6_rip6_input __P((struct mbuf **, int)); static int icmp6_ratelimit __P((const struct in6_addr *, const int, const int)); static const char *icmp6_redirect_diag __P((struct in6_addr *, struct in6_addr *, struct in6_addr *)); static struct mbuf *ni6_input __P((struct mbuf *, int)); static struct mbuf *ni6_nametodns __P((const char *, int, int)); static int ni6_dnsmatch __P((const char *, int, const char *, int)); static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *)); static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int)); static int icmp6_notify_error __P((struct mbuf **, int, int, int)); void icmp6_init() { mld6_init(); } static void icmp6_errcount(stat, type, code) struct icmp6errstat *stat; int type, code; { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: stat->icp6errs_dst_unreach_noroute++; return; case ICMP6_DST_UNREACH_ADMIN: stat->icp6errs_dst_unreach_admin++; return; case ICMP6_DST_UNREACH_BEYONDSCOPE: stat->icp6errs_dst_unreach_beyondscope++; return; case ICMP6_DST_UNREACH_ADDR: stat->icp6errs_dst_unreach_addr++; return; case ICMP6_DST_UNREACH_NOPORT: stat->icp6errs_dst_unreach_noport++; return; } break; case ICMP6_PACKET_TOO_BIG: stat->icp6errs_packet_too_big++; return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: stat->icp6errs_time_exceed_transit++; return; case ICMP6_TIME_EXCEED_REASSEMBLY: stat->icp6errs_time_exceed_reassembly++; return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: stat->icp6errs_paramprob_header++; return; case ICMP6_PARAMPROB_NEXTHEADER: stat->icp6errs_paramprob_nextheader++; return; case ICMP6_PARAMPROB_OPTION: stat->icp6errs_paramprob_option++; return; } break; case ND_REDIRECT: stat->icp6errs_redirect++; return; } stat->icp6errs_unknown++; } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(m, type, code, param, ifp) struct mbuf *m; int type, code, param; struct ifnet *ifp; { struct ip6_hdr *ip6; if (ifp == NULL) return; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(m, type, code, param) struct mbuf *m; int type, code, param; { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; icmp6stat.icp6s_error++; /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { icmp6stat.icp6s_canterror++; goto freeit; } #endif #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); #else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) return; } #endif oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Pakcet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. */ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { icmp6stat.icp6s_toofreq++; goto freeit; } /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_DONTWAIT); if (m && m->m_len < preplen) m = m_pullup(m, preplen); if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); /* * icmp6_reflect() is designed to be in the input path. * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). */ m->m_pkthdr.rcvif = NULL; icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } /* * Process a received ICMP6 message. */ int icmp6_input(mp, offp, proto) struct mbuf **mp; int *offp, proto; { struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ #endif /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } /* * calculate the checksum */ #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(&ip6->ip6_src))); icmp6stat.icp6s_checksum++; goto freeit; } if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. */ switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: break; default: goto freeit; } } icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_NOROUTE: code = PRC_UNREACH_NET; break; case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib); code = PRC_UNREACH_PROTOCOL; /* is this a good code? */ break; case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); /* validation is made in icmp6_mtudisc_update */ code = PRC_MSGSIZE; /* * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; break; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: code = PRC_TIMXCEED_INTRANS; break; case ICMP6_TIME_EXCEED_REASSEMBLY: code = PRC_TIMXCEED_REASS; break; default: goto badcode; } goto deliver; break; case ICMP6_PARAM_PROB: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: code = PRC_UNREACH_PROTOCOL; break; case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: code = PRC_PARAMPROB; break; default: goto badcode; } goto deliver; break; case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo); if (code != 0) goto badcode; if ((n = m_copy(m, 0, M_COPYALL)) == NULL) { /* Give up remote */ break; } if ((n->m_flags & M_EXT) != 0 || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; const int maxlen = sizeof(*nip6) + sizeof(*nicmp6); int n0len; MGETHDR(n, M_DONTWAIT, n0->m_type); n0len = n0->m_pkthdr.len; /* save for use below */ if (n) M_MOVE_PKTHDR(n, n0); if (n && maxlen >= MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (n == NULL) { /* Give up remote */ m_freem(n0); break; } /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). */ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { nip6 = mtod(n, struct ip6_hdr *); nicmp6 = (struct icmp6_hdr *)((caddr_t)nip6 + off); noff = off; } nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: if (icmp6len < sizeof(struct mld_hdr)) goto badlen; if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ mld6_input(m, off); m = NULL; goto freeit; } mld6_input(n, off); /* m stays. */ break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ case MLD_MTRACE_RESP: case MLD_MTRACE: /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; if (!icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; #define hostnamelen strlen(hostname) if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), IPPROTO_DONE); #endif n = m_copy(m, 0, M_COPYALL); if (n) n = ni6_input(n, off); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxlen, maxhlen; if ((icmp6_nodeinfo & 5) != 5) break; if (code != 0) goto badcode; maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4; if (maxlen >= MCLBYTES) { /* Give up remote */ break; } MGETHDR(n, M_DONTWAIT, m->m_type); if (n && maxlen > MHLEN) { MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!m_dup_pkthdr(n, m, M_DONTWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; } if (n == NULL) { /* Give up remote */ break; } n->m_pkthdr.rcvif = NULL; n->m_len = 0; maxhlen = M_TRAILINGSPACE(n) - maxlen; if (maxhlen > hostnamelen) maxhlen = hostnamelen; /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } #undef hostnamelen if (n) { icmp6stat.icp6s_reflect++; icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_rs_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_rs_input(n, off, icmp6len); /* m stays. */ break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ra_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ra_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_ns_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_ns_input(n, off, icmp6len); /* m stays. */ break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ nd6_na_input(m, off, icmp6len); m = NULL; goto freeit; } nd6_na_input(n, off, icmp6len); /* m stays. */ break; case ND_REDIRECT: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ icmp6_redirect_input(m, off); m = NULL; goto freeit; } icmp6_redirect_input(n, off); /* m stays. */ break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ code = PRC_NCMDS; /* deliver */ } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len, code)) { /* In this case, m should've been freed. */ return (IPPROTO_DONE); } break; badcode: icmp6stat.icp6s_badcode++; break; badlen: icmp6stat.icp6s_badlen++; break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); return IPPROTO_DONE; freeit: m_freem(m); return IPPROTO_DONE; } static int icmp6_notify_error(mp, off, icmp6len, code) struct mbuf **mp; int off, icmp6len, code; { struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ { void (*ctlfunc) __P((int, struct sockaddr *, void *)); u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; struct in6_addr *finaldst = NULL; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_ext), -1); eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. */ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(rth0, struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(struct ip6_frag), -1); fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); #else IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: #ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; return (-1); } #endif /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (finaldst == NULL) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = finaldst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)¬ifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } ctlfunc = (void (*) __P((int, struct sockaddr *, void *))) (inet6sw[ip6_protox[nxt]].pr_ctlinput); if (ctlfunc) { (void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst, &ip6cp); } } *mp = m; return (0); freeit: m_freem(m); return (-1); } void icmp6_mtudisc_update(ip6cp, validated) struct ip6ctlparam *ip6cp; int validated; { struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value. * XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; bzero(&inc, sizeof(inc)); inc.inc_flags = 1; /* IPv6 */ inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; - if (mtu < tcp_maxmtu6(&inc)) { + if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); icmp6stat.icp6s_pmtuchg++; } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ #define hostnamelen strlen(hostname) static struct mbuf * ni6_input(m, off) struct mbuf *m; int off; { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ return (NULL); } #endif /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [icmp-name-lookups-08, Section 4.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ if ((ia6 = ip6_getdstifaddr(m)) == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(icmp6_nodeinfo & 4)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ n = ni6_nametodns(hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, off + sizeof(struct icmp6_nodeinfo), subjlen); if (subj == NULL) goto bad; if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: if ((icmp6_nodeinfo & 1) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((icmp6_nodeinfo & 2) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* allocate an mbuf to reply. */ MGETHDR(n, M_DONTWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } M_MOVE_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } MCLGET(n, M_DONTWAIT); if ((n->m_flags & M_EXT) == 0) { goto bad; } } n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in variable "hostname"? */ n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } #undef hostnamelen /* * make a mbuf with DNS-encoded string. no compression support. * * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. */ static struct mbuf * ni6_nametodns(name, namelen, old) const char *name; int namelen; int old; /* return pascal string if non-zero */ { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* because MAXHOSTNAMELEN is usually 256, we use cluster mbuf */ MGET(m, M_DONTWAIT, MT_DATA); if (m && len > MLEN) { MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto fail; } if (!m) goto fail; m->m_next = NULL; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(a, alen, b, blen) const char *a; int alen; const char *b; int blen; { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(ni6, m, ifpp, subj) struct icmp6_nodeinfo *ni6; struct mbuf *m; struct ifnet **ifpp; struct in6_addr *subj; { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. */ return (0); } } IFNET_RLOCK(); for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; IFNET_RUNLOCK(); return (addrsofif); } addrs += addrsofif; } IFNET_RUNLOCK(); return (addrs); } static int ni6_store_addrs(ni6, nni6, ifp0, resid) struct icmp6_nodeinfo *ni6, *nni6; struct ifnet *ifp0; int resid; { struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ IFNET_RLOCK(); again: for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * prefererred address should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (icmp6_nodeinfo & 4) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; IFNET_RUNLOCK(); return (copied); } /* * Set the TTL of the address. * The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_second) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } IFNET_RUNLOCK(); return (copied); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(mp, off) struct mbuf **mp; int off; { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; struct in6pcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { /* m is already reclaimed */ return (IPPROTO_DONE); } #endif /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); return (IPPROTO_DONE); } INP_INFO_RLOCK(&ripcbinfo); LIST_FOREACH(in6p, &ripcb, inp_list) { INP_LOCK(in6p); if ((in6p->inp_vflag & INP_IPV6) == 0) { docontinue: INP_UNLOCK(in6p); continue; } if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) goto docontinue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) goto docontinue; if (in6p->in6p_icmp6filt && ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, in6p->in6p_icmp6filt)) goto docontinue; if (last) { struct mbuf *n = NULL; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } if (n != NULL || (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked( &last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) { m_freem(opts); } SOCKBUF_UNLOCK( &last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); opts = NULL; } INP_UNLOCK(last); } last = in6p; } if (last) { if (last->in6p_flags & IN6P_CONTROLOPTS) ip6_savecontrol(last, m, &opts); /* strip intermediate headers */ m_adj(m, off); /* avoid using mbuf clusters if possible (see above) */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { struct mbuf *n; MGET(n, M_DONTWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; m_freem(m); m = n; } else { m_freem(n); n = NULL; } } } SOCKBUF_LOCK(&last->in6p_socket->so_rcv); if (sbappendaddr_locked(&last->in6p_socket->so_rcv, (struct sockaddr *)&fromsa, m, opts) == 0) { m_freem(m); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&last->in6p_socket->so_rcv); } else sorwakeup_locked(last->in6p_socket); INP_UNLOCK(last); } else { m_freem(m); ip6stat.ip6s_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); return IPPROTO_DONE; } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ void icmp6_reflect(m, off) struct mbuf *m; size_t off; { struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; int plen; int type, code; struct ifnet *outif = NULL; struct in6_addr origdst, *src = NULL; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ origdst = ip6->ip6_dst; /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. * Note that ip6_getdstifaddr() may fail if we are in an error handling * procedure of an outgoing packet of our own, in which case we need * to search in the ifaddr list. */ if (!IN6_IS_ADDR_MULTICAST(&origdst)) { if ((ia = ip6_getdstifaddr(m))) { if (!(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) src = &ia->ia_addr.sin6_addr; } else { struct sockaddr_in6 d; bzero(&d, sizeof(d)); d.sin6_family = AF_INET6; d.sin6_len = sizeof(d); d.sin6_addr = origdst; ia = (struct in6_ifaddr *) ifa_ifwithaddr((struct sockaddr *)&d); if (ia && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src = &ia->ia_addr.sin6_addr; } } } if (src == NULL) { int e; struct sockaddr_in6 sin6; struct route_in6 ro; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. */ bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e); if (ro.ro_rt) RTFREE(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(&sin6.sin6_addr), e)); goto bad; } } ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; if (outif) ip6->ip6_hlim = ND_IFINFO(outif)->chlim; else if (m->m_pkthdr.rcvif) { /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else ip6->ip6_hlim = ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } void icmp6_fasttimo() { return; } static const char * icmp6_redirect_diag(src6, dst6, tgt6) struct in6_addr *src6; struct in6_addr *dst6; struct in6_addr *tgt6; { static char buf[1024]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(src6), ip6_sprintf(dst6), ip6_sprintf(tgt6)); return buf; } void icmp6_redirect_input(m, off) struct mbuf *m; int off; { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; int icmp6len = ntohs(ip6->ip6_plen); char *lladdr = NULL; int lladdrlen = 0; u_char *redirhdr = NULL; int redirhdrlen = 0; struct rtentry *rt = NULL; int is_router; int is_onlink; struct in6_addr src6 = ip6->ip6_src; struct in6_addr redtgt6; struct in6_addr reddst6; union nd_opts ndopts; if (!m || !ifp) return; /* XXX if we are router, we don't update route by icmp6 redirect */ if (ip6_forwarding) goto freeit; if (!icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { icmp6stat.icp6s_tooshort++; return; } #endif redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { goto freeit; } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(&src6))); goto bad; } if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(&src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct sockaddr_in6 sin6; struct in6_addr *gw6; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6)); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr); if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(gw6), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); RTFREE_LOCKED(rt); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } RTFREE_LOCKED(rt); rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* validation passed */ icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "icmp6_redirect_input: " "invalid ND option, rejected: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (ndopts.nd_opts_rh) { redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len; redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */ } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "icmp6_redirect_input: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", ip6_sprintf(&redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); if (!is_onlink) { /* better router case. perform rtredirect. */ /* perform rtredirect */ struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; bzero(&sdst, sizeof(sdst)); bzero(&sgw, sizeof(sgw)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&ssrc); } /* finally update cached route in each socket via pfctlinput */ { struct sockaddr_in6 sdst; bzero(&sdst, sizeof(sdst)); sdst.sin6_family = AF_INET6; sdst.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst); #if defined(IPSEC) || defined(FAST_IPSEC) key_sa_routechange((struct sockaddr *)&sdst); #endif } freeit: m_freem(m); return; bad: icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(m0, rt) struct mbuf *m0; struct rtentry *rt; { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m && IPV6_MMTU >= MHLEN) MCLGET(m, M_DONTWAIT); if (!m) goto fail; m->m_pkthdr.rcvif = NULL; m->m_len = 0; maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; } /* get ip6 linklocal address for the router. */ if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)rt->rt_gateway; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (rt->rt_flags & RTF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ struct rtentry *rt_router = NULL; int len; struct sockaddr_dl *sdl; struct nd_opt_hdr *nd_opt; char *lladdr; rt_router = nd6_lookup(router_ll6, 0, ifp); if (!rt_router) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (!(rt_router->rt_flags & RTF_GATEWAY) && (rt_router->rt_flags & RTF_LLINFO) && (rt_router->rt_gateway->sa_family == AF_LINK) && (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && sdl->sdl_alen) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); p += len; } } nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. */ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. */ int icmp6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter *p; if (optlen != sizeof(*p)) { error = EMSGSIZE; break; } if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, optlen); break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { if (inp->in6p_icmp6filt == NULL) { error = EINVAL; break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, sizeof(struct icmp6_filter)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? */ static int icmp6_ratelimit(dst, type, code) const struct in6_addr *dst; /* not used at this moment */ const int type; /* not used at this moment */ const int code; /* not used at this moment */ { int ret; ret = 0; /* okay to send */ /* PPS limit */ if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } return ret; } Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 162083) +++ head/sys/sys/mbuf.h (revision 162084) @@ -1,855 +1,857 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #ifdef WITNESS #include #endif #endif /* * Mbufs are of a single size, MSIZE (sys/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size * MCLBYTES (also in sys/param.h), which has no additional overhead * and is used instead of the internal data area; this is done when * at least MINCLSIZE of data must be stored. Additionally, it is possible * to allocate a separate buffer externally and attach it to the mbuf in * a way similar to that of mbuf clusters. */ #define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ #define MINCLSIZE (MHLEN + 1) /* smallest amount to put in cluster */ #define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ #ifdef _KERNEL /*- * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * dtom(x) -- Convert data pointer within mbuf to mbuf pointer (XXX). */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Header present at the beginning of every mbuf. */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ caddr_t mh_data; /* location of data */ int mh_len; /* amount of data in this mbuf */ int mh_flags; /* flags; see below */ short mh_type; /* type of data in this mbuf */ }; /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ int len; /* total packet length */ /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ + u_int16_t tso_segsz; /* TSO segment size */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ }; /* * Description of external storage mapped into mbuf; valid only if M_EXT is set. */ struct m_ext { caddr_t ext_buf; /* start of buffer */ void (*ext_free) /* free routine if not the usual */ (void *, void *); void *ext_args; /* optional argument pointer */ u_int ext_size; /* size of buffer, for ext_free */ volatile u_int *ref_cnt; /* pointer to ref count info */ int ext_type; /* type of external storage */ }; /* * The core of the mbuf object along with some shortcut defines for * practical purposes. */ struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; #define m_next m_hdr.mh_next #define m_len m_hdr.mh_len #define m_data m_hdr.mh_data #define m_type m_hdr.mh_type #define m_flags m_hdr.mh_flags #define m_nextpkt m_hdr.mh_nextpkt #define m_act m_nextpkt #define m_pkthdr M_dat.MH.MH_pkthdr #define m_ext M_dat.MH.MH_dat.MH_ext #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf /* * mbuf flags. */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_RDONLY 0x0008 /* associated data is marked read-only */ #define M_PROTO1 0x0010 /* protocol-specific */ #define M_PROTO2 0x0020 /* protocol-specific */ #define M_PROTO3 0x0040 /* protocol-specific */ #define M_PROTO4 0x0080 /* protocol-specific */ #define M_PROTO5 0x0100 /* protocol-specific */ #define M_SKIP_FIREWALL 0x4000 /* skip firewall processing */ #define M_FREELIST 0x8000 /* mbuf is on the free list */ /* * mbuf pkthdr flags (also stored in m_flags). */ #define M_BCAST 0x0200 /* send/received as link-level broadcast */ #define M_MCAST 0x0400 /* send/received as link-level multicast */ #define M_FRAG 0x0800 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x1000 /* packet is first fragment */ #define M_LASTFRAG 0x2000 /* packet is last fragment */ #define M_VLANTAG 0x10000 /* packet has VLAN tag attached */ /* * External buffer types: identify ext_buf type. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_bufs */ #define EXT_JUMBOP 3 /* jumbo cluster 4096 bytes */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_NET_DRV 100 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 200 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 300 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 400 /* has externally maintained ref_cnt ptr */ /* * Flags copied when copying m_pkthdr. */ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_RDONLY|M_PROTO1|M_PROTO1|M_PROTO2|\ M_PROTO3|M_PROTO4|M_PROTO5|M_SKIP_FIREWALL|\ M_BCAST|M_MCAST|M_FRAG|M_FIRSTFRAG|M_LASTFRAG|\ M_VLANTAG) /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5) /* * Flags indicating hw checksum support and sw checksum requirements. * This field can be directly tested against if_data.ifi_hwassist. */ #define CSUM_IP 0x0001 /* will csum IP */ #define CSUM_TCP 0x0002 /* will csum TCP */ #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define CSUM_TSO 0x0020 /* will do TSO */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ #define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ #define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ #define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) #define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ /* * mbuf types. */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_SONAME 8 /* socket name */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * General mbuf allocator statistics structure. * * Many of these statistics are no longer used; we instead track many * allocator statistics through UMA's built in statistics mechanism. */ struct mbstat { u_long m_mbufs; /* XXX */ u_long m_mclusts; /* XXX */ u_long m_drain; /* times drained protocols for space */ u_long m_mcfail; /* XXX: times m_copym failed */ u_long m_mpfail; /* XXX: times m_pullup failed */ u_long m_msize; /* length of an mbuf */ u_long m_mclbytes; /* length of an mbuf cluster */ u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ /* Number of mbtypes (gives # elems in mbtypes[] array: */ short m_numtypes; /* XXX: Sendfile stats should eventually move to their own struct */ u_long sf_iocnt; /* times sendfile had to do disk I/O */ u_long sf_allocfail; /* times sfbuf allocation failed */ u_long sf_allocwait; /* times sfbuf allocation had to wait */ }; /* * Flags specifying how an allocation should be made. * * The flag to use is as follows: * - M_DONTWAIT or M_NOWAIT from an interrupt handler to not block allocation. * - M_WAIT or M_WAITOK or M_TRYWAIT from wherever it is safe to block. * * M_DONTWAIT/M_NOWAIT means that we will not block the thread explicitly * and if we cannot allocate immediately we may return NULL, * whereas M_WAIT/M_WAITOK/M_TRYWAIT means that if we cannot allocate * resources we will block until they are available, and thus never * return NULL. * * XXX Eventually just phase this out to use M_WAITOK/M_NOWAIT. */ #define MBTOM(how) (how) #define M_DONTWAIT M_NOWAIT #define M_TRYWAIT M_WAITOK #define M_WAIT M_WAITOK /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_pagesize" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_ext_refcnt; extern uma_zone_t zone_mtag_vlan; static __inline struct mbuf *m_get(int how, short type); static __inline struct mbuf *m_gethdr(int how, short type); static __inline struct mbuf *m_getcl(int how, short type, int flags); static __inline struct mbuf *m_getjcl(int how, short type, int flags, int size); static __inline struct mbuf *m_getclr(int how, short type); /* XXX */ static __inline struct mbuf *m_free(struct mbuf *m); static __inline void m_clget(struct mbuf *m, int how); static __inline void *m_cljget(struct mbuf *m, int how, int size); static __inline void m_chtype(struct mbuf *m, short new_type); void mb_free_ext(struct mbuf *); static __inline struct mbuf * m_get(int how, short type) { struct mb_args args; args.flags = 0; args.type = type; return (struct mbuf *)(uma_zalloc_arg(zone_mbuf, &args, how)); } /* XXX This should be depracated, very little use */ static __inline struct mbuf * m_getclr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m != NULL) bzero(m->m_data, MLEN); return m; } static __inline struct mbuf * m_gethdr(int how, short type) { struct mb_args args; args.flags = M_PKTHDR; args.type = type; return (struct mbuf *)(uma_zalloc_arg(zone_mbuf, &args, how)); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mb_args args; args.flags = flags; args.type = type; return (struct mbuf *)(uma_zalloc_arg(zone_pack, &args, how)); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ static __inline /* XXX: This is rather large, should be real function maybe. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return NULL; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: m_getjcl: invalid cluster type", __func__); } n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return NULL; } return m; } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; if (m->m_flags & M_EXT) mb_free_ext(m); else uma_zfree(zone_mbuf, m); return n; } static __inline void m_clget(struct mbuf *m, int how) { if (m->m_flags & M_EXT) printf("%s: %p mbuf already has cluster\n", __func__, m); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); } /* * m_cljget() is different from m_clget() as it can allocate clusters * without attaching them to an mbuf. In that case the return value * is the pointer to the cluster of the requested size. If an mbuf was * specified, it gets the cluster attached to it and the return value * can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ static __inline void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; if (m && m->m_flags & M_EXT) printf("%s: %p mbuf already has cluster\n", __func__, m); if (m != NULL) m->m_ext.ext_buf = NULL; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: m_getjcl: invalid cluster type", __func__); } return (uma_zalloc_arg(zone, m, how)); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } /* * mbuf, cluster, and external object allocation macros * (for compatibility purposes). */ /* NB: M_COPY_PKTHDR is deprecated. Use M_MOVE_PKTHDR or m_dup_pktdr. */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, args, flags, type) \ m_extadd((m), (caddr_t)(buf), (size), (free), (args), (flags), (type)) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this * can be both the local data payload, or an external buffer area, * depending on whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (*((m)->m_ext.ref_cnt) == 1)) ) \ /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT(m != NULL && m->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* Ensure that the supplied mbuf is a valid, non-free mbuf. */ /* XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ #define M_ALIGN(m, len) do { \ KASSERT(!((m)->m_flags & (M_PKTHDR|M_EXT)), \ ("%s: M_ALIGN not normal mbuf", __func__)); \ KASSERT((m)->m_data == (m)->m_dat, \ ("%s: M_ALIGN not a virgin mbuf", __func__)); \ (m)->m_data += (MLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. */ #define MH_ALIGN(m, len) do { \ KASSERT((m)->m_flags & M_PKTHDR && !((m)->m_flags & M_EXT), \ ("%s: MH_ALIGN not PKTHDR mbuf", __func__)); \ KASSERT((m)->m_data == (m)->m_pktdat, \ ("%s: MH_ALIGN not a virgin mbuf", __func__)); \ (m)->m_data += (MHLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * Compute the amount of space available * before the current start of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_LEADINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_data - (m)->m_ext.ext_buf : 0): \ (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat : \ (m)->m_data - (m)->m_dat) /* * Compute the amount of space available * after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_TRAILINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size \ - ((m)->m_data + (m)->m_len) : 0) : \ &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) /* * Arrange to prepend space of size plen to mbuf m. * If a new mbuf must be allocated, how specifies whether to wait. * If the allocation fails, the original mbuf chain is freed and m is * set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. * This is a relatively expensive operation and should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 /* Compatibility with 4.3. */ #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ struct uio; void m_adj(struct mbuf *, int); void m_align(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_extadd(struct mbuf *, caddr_t, u_int, void (*)(void *, void *), void *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copymdata(struct mbuf *, struct mbuf *, int, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *n, int len, int dstoff); struct mbuf *m_defrag(struct mbuf *, int); void m_demote(struct mbuf *, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); struct mbuf *m_dup(struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_getm(struct mbuf *, int, int, short); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); void m_move_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int); struct mbuf *m_unshare(struct mbuf *, int how); /*- * Network packets may have annotations attached by affixing a list * of "packet tags" to the pkthdr structure. Packet tags are * dynamically allocated semi-opaque data structures that have * a fixed header (struct m_tag) that specifies the size of the * memory block and a pair that identifies it. * The cookie is a 32-bit unique unsigned value used to identify * a module or ABI. By convention this value is chose as the * date+time that the module is created, expressed as the number of * seconds since the epoch (e.g., using date -u +'%s'). The type value * is an ABI/module-specific value that identifies a particular annotation * and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, * the value PACKET_ABI_COMPAT is used to implement m_tag_get and * m_tag_find compatibility shim functions and several tag types are * defined below. Users that do not require compatibility should use * a private cookie value so that packet tag-related definitions * can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default * memory alignment implemented by malloc. To reference private data * one can use a construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members * of struct foo. Otherwise it is necessary to embed struct m_tag within * the private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. * Otherwise tags are expected to ``vanish'' when they pass through * a network interface. For most interfaces this happens normally * as the tags are reclaimed when the mbuf is free'd. However in * some special cases reclaiming must be done manually. An example * is packets that pass through the loopback interface. Also, one * must be careful to do this when ``turning around'' packets (e.g., * icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the * tag id. The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF_ROUTED 21 /* PF routed, avoid loops */ #define PACKET_TAG_PF_FRAGCACHE 22 /* PF fragment cached */ #define PACKET_TAG_PF_QID 23 /* PF ALTQ queue id */ #define PACKET_TAG_PF_TAG 24 /* PF tagged */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_PF_TRANSLATE_LOCALHOST 26 /* PF translate localhost */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ /* Specific cookies and tags. */ #define MTAG_VLAN 1035328035 #define MTAG_VLAN_TAG 0 /* tag of VLAN interface */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not * fill in the free method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was * already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */