Index: stable/6/share/man/man4/ip.4 =================================================================== --- stable/6/share/man/man4/ip.4 (revision 150827) +++ stable/6/share/man/man4/ip.4 (revision 150828) @@ -1,643 +1,657 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. All advertising materials mentioning features or use of this software .\" must display the following acknowledgement: .\" This product includes software developed by the University of .\" California, Berkeley and its contributors. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)ip.4 8.2 (Berkeley) 11/30/93 .\" $FreeBSD$ .\" -.Dd August 22, 2005 +.Dd September 26, 2005 .Dt IP 4 .Os .Sh NAME .Nm ip .Nd Internet Protocol .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .In netinet/in.h .Ft int .Fn socket AF_INET SOCK_RAW proto .Sh DESCRIPTION .Tn IP is the transport layer protocol used by the Internet protocol family. Options may be set at the .Tn IP level when using higher-level protocols that are based on .Tn IP (such as .Tn TCP and .Tn UDP ) . It may also be accessed through a .Dq raw socket when developing new protocols, or special-purpose applications. .Pp There are several .Tn IP-level .Xr setsockopt 2 and .Xr getsockopt 2 options. .Dv IP_OPTIONS may be used to provide .Tn IP options to be transmitted in the .Tn IP header of each outgoing packet or to examine the header options on incoming packets. .Tn IP options may be used with any socket type in the Internet family. The format of .Tn IP options to be sent is that specified by the .Tn IP protocol specification (RFC-791), with one exception: the list of addresses for Source Route options must include the first-hop gateway at the beginning of the list of gateways. The first-hop gateway address will be extracted from the option list and the size adjusted accordingly before use. To disable previously specified options, use a zero-length buffer: .Bd -literal setsockopt(s, IPPROTO_IP, IP_OPTIONS, NULL, 0); .Ed .Pp .Dv IP_TOS and .Dv IP_TTL may be used to set the type-of-service and time-to-live fields in the .Tn IP header for .Dv SOCK_STREAM , SOCK_DGRAM , and certain types of .Dv SOCK_RAW sockets. For example, .Bd -literal int tos = IPTOS_LOWDELAY; /* see */ setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)); int ttl = 60; /* max = 255 */ setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)); .Ed .Pp .Dv IP_MINTTL may be used to set the minimum acceptable TTL a packet must have when received on a socket. All packets with a lower TTL are silently dropped. This option is only really useful when set to 255 preventing packets from outside the directly connected networks reaching local listeners on sockets. +.Pp +.Dv IP_DONTFRAG +may be used to set the Don't Fragment flag on IP packets. +Currently this option is respected only on +.Xr udp 4 +and Raw +.Xr ip 4 +sockets, unless the IP_HDRINCL option has been set. +On +.Xr tcp 4 +sockets the Don't Fragment flag is controlled by the Path +MTU Discovery option. +Sending a packet larger than the MTU size of the egress interface, +determined by the destination address, returns an EMSGSIZE error. .Pp If the .Dv IP_RECVDSTADDR option is enabled on a .Dv SOCK_DGRAM socket, the .Xr recvmsg 2 call will return the destination .Tn IP address for a .Tn UDP datagram. The .Vt msg_control field in the .Vt msghdr structure points to a buffer that contains a .Vt cmsghdr structure followed by the .Tn IP address. The .Vt cmsghdr fields have the following values: .Bd -literal cmsg_len = sizeof(struct in_addr) cmsg_level = IPPROTO_IP cmsg_type = IP_RECVDSTADDR .Ed .Pp The source address to be used for outgoing .Tn UDP datagrams on a socket that is not bound to a specific .Tn IP address can be specified as ancillary data with a type code of .Dv IP_SENDSRCADDR . The msg_control field in the msghdr structure should point to a buffer that contains a .Vt cmsghdr structure followed by the .Tn IP address. The cmsghdr fields should have the following values: .Bd -literal cmsg_len = sizeof(struct in_addr) cmsg_level = IPPROTO_IP cmsg_type = IP_SENDSRCADDR .Ed .Pp For convenience, .Dv IP_SENDSRCADDR is defined to have the same value as .Dv IP_RECVDSTADDR , so the .Dv IP_RECVDSTADDR control message from .Xr recvmsg 2 can be used directly as a control message for .Xr sendmsg 2 . .Pp If the .Dv IP_ONESBCAST option is enabled on a .Dv SOCK_DGRAM or a .Dv SOCK_RAW socket, the destination address of outgoing broadcast datagrams on that socket will be forced to the undirected broadcast address, .Dv INADDR_BROADCAST , before transmission. This is in contrast to the default behavior of the system, which is to transmit undirected broadcasts via the first network interface with the .Dv IFF_BROADCAST flag set. .Pp This option allows applications to choose which interface is used to transmit an undirected broadcast datagram. For example, the following code would force an undirected broadcast to be transmitted via the interface configured with the broadcast address 192.168.2.255: .Bd -literal char msg[512]; struct sockaddr_in sin; u_char onesbcast = 1; /* 0 = disable (default), 1 = enable */ setsockopt(s, IPPROTO_IP, IP_ONESBCAST, &onesbcast, sizeof(onesbcast)); sin.sin_addr.s_addr = inet_addr("192.168.2.255"); sin.sin_port = htons(1234); sendto(s, msg, sizeof(msg), 0, &sin, sizeof(sin)); .Ed .Pp It is the application's responsibility to set the .Dv IP_TTL option to an appropriate value in order to prevent broadcast storms. The application must have sufficient credentials to set the .Dv SO_BROADCAST socket level option, otherwise the .Dv IP_ONESBCAST option has no effect. .Pp If the .Dv IP_RECVTTL option is enabled on a .Dv SOCK_DGRAM socket, the .Xr recvmsg 2 call will return the .Tn IP .Tn TTL (time to live) field for a .Tn UDP datagram. The msg_control field in the msghdr structure points to a buffer that contains a cmsghdr structure followed by the .Tn TTL . The cmsghdr fields have the following values: .Bd -literal cmsg_len = sizeof(u_char) cmsg_level = IPPROTO_IP cmsg_type = IP_RECVTTL .Ed .Pp If the .Dv IP_RECVIF option is enabled on a .Dv SOCK_DGRAM socket, the .Xr recvmsg 2 call returns a .Vt "struct sockaddr_dl" corresponding to the interface on which the packet was received. The .Va msg_control field in the .Vt msghdr structure points to a buffer that contains a .Vt cmsghdr structure followed by the .Vt "struct sockaddr_dl" . The .Vt cmsghdr fields have the following values: .Bd -literal cmsg_len = sizeof(struct sockaddr_dl) cmsg_level = IPPROTO_IP cmsg_type = IP_RECVIF .Ed .Pp .Dv IP_PORTRANGE may be used to set the port range used for selecting a local port number on a socket with an unspecified (zero) port number. It has the following possible values: .Bl -tag -width IP_PORTRANGE_DEFAULT .It Dv IP_PORTRANGE_DEFAULT use the default range of values, normally .Dv IPPORT_HIFIRSTAUTO through .Dv IPPORT_HILASTAUTO . This is adjustable through the sysctl setting: .Va net.inet.ip.portrange.first and .Va net.inet.ip.portrange.last . .It Dv IP_PORTRANGE_HIGH use a high range of values, normally .Dv IPPORT_HIFIRSTAUTO and .Dv IPPORT_HILASTAUTO . This is adjustable through the sysctl setting: .Va net.inet.ip.portrange.hifirst and .Va net.inet.ip.portrange.hilast . .It Dv IP_PORTRANGE_LOW use a low range of ports, which are normally restricted to privileged processes on .Ux systems. The range is normally from .Dv IPPORT_RESERVED \- 1 down to .Li IPPORT_RESERVEDSTART in descending order. This is adjustable through the sysctl setting: .Va net.inet.ip.portrange.lowfirst and .Va net.inet.ip.portrange.lowlast . .El .Pp The range of privileged ports which only may be opened by root-owned processes may be modified by the .Va net.inet.ip.portrange.reservedlow and .Va net.inet.ip.portrange.reservedhigh sysctl settings. The values default to the traditional range, 0 through .Dv IPPORT_RESERVED \- 1 (0 through 1023), respectively. Note that these settings do not affect and are not accounted for in the use or calculation of the other .Va net.inet.ip.portrange values above. Changing these values departs from .Ux tradition and has security consequences that the administrator should carefully evaluate before modifying these settings. .Pp Ports are allocated at random within the specified port range in order to increase the difficulty of random spoofing attacks. In scenarios such as benchmarking, this behavior may be undesirable. In these cases, .Va net.inet.ip.portrange.randomized can be used to toggle randomization off. If more than .Va net.inet.ip.portrange.randomcps ports have been allocated in the last second, then return to sequential port allocation. Return to random allocation only once the current port allocation rate drops below .Va net.inet.ip.portrange.randomcps for at least .Va net.inet.ip.portrange.randomtime seconds. The default values for .Va net.inet.ip.portrange.randomcps and .Va net.inet.ip.portrange.randomtime are 10 port allocations per second and 45 seconds correspondingly. .Ss "Multicast Options" .Pp .Tn IP multicasting is supported only on .Dv AF_INET sockets of type .Dv SOCK_DGRAM and .Dv SOCK_RAW , and only on networks where the interface driver supports multicasting. .Pp The .Dv IP_MULTICAST_TTL option changes the time-to-live (TTL) for outgoing multicast datagrams in order to control the scope of the multicasts: .Bd -literal u_char ttl; /* range: 0 to 255, default = 1 */ setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); .Ed .Pp Datagrams with a TTL of 1 are not forwarded beyond the local network. Multicast datagrams with a TTL of 0 will not be transmitted on any network, but may be delivered locally if the sending host belongs to the destination group and if multicast loopback has not been disabled on the sending socket (see below). Multicast datagrams with TTL greater than 1 may be forwarded to other networks if a multicast router is attached to the local network. .Pp For hosts with multiple interfaces, each multicast transmission is sent from the primary network interface. The .Dv IP_MULTICAST_IF option overrides the default for subsequent transmissions from a given socket: .Bd -literal struct in_addr addr; setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)); .Ed .Pp where "addr" is the local .Tn IP address of the desired interface or .Dv INADDR_ANY to specify the default interface. An interface's local IP address and multicast capability can be obtained via the .Dv SIOCGIFCONF and .Dv SIOCGIFFLAGS ioctls. Normal applications should not need to use this option. .Pp If a multicast datagram is sent to a group to which the sending host itself belongs (on the outgoing interface), a copy of the datagram is, by default, looped back by the IP layer for local delivery. The .Dv IP_MULTICAST_LOOP option gives the sender explicit control over whether or not subsequent datagrams are looped back: .Bd -literal u_char loop; /* 0 = disable, 1 = enable (default) */ setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); .Ed .Pp This option improves performance for applications that may have no more than one instance on a single host (such as a router daemon), by eliminating the overhead of receiving their own transmissions. It should generally not be used by applications for which there may be more than one instance on a single host (such as a conferencing program) or for which the sender does not belong to the destination group (such as a time querying program). .Pp A multicast datagram sent with an initial TTL greater than 1 may be delivered to the sending host on a different interface from that on which it was sent, if the host belongs to the destination group on that other interface. The loopback control option has no effect on such delivery. .Pp A host must become a member of a multicast group before it can receive datagrams sent to the group. To join a multicast group, use the .Dv IP_ADD_MEMBERSHIP option: .Bd -literal struct ip_mreq mreq; setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); .Ed .Pp where .Fa mreq is the following structure: .Bd -literal struct ip_mreq { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_interface; /* local IP address of interface */ } .Ed .Pp .Va imr_interface should be set to .Dv INADDR_ANY to choose the default multicast interface, or the .Tn IP address of a particular multicast-capable interface if the host is multihomed. Since .Fx 4.4 , if the .Va imr_interface member is within the network range .Li 0.0.0.0/8 , it is treated as an interface index in the system interface MIB, as per the RIP Version 2 MIB Extension (RFC-1724). .Pp Membership is associated with a single interface; programs running on multihomed hosts may need to join the same group on more than one interface. Up to .Dv IP_MAX_MEMBERSHIPS (currently 20) memberships may be added on a single socket. .Pp To drop a membership, use: .Bd -literal struct ip_mreq mreq; setsockopt(s, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); .Ed .Pp where .Fa mreq contains the same values as used to add the membership. Memberships are dropped when the socket is closed or the process exits. .\"----------------------- .Ss "Raw IP Sockets" .Pp Raw .Tn IP sockets are connectionless, and are normally used with the .Xr sendto 2 and .Xr recvfrom 2 calls, though the .Xr connect 2 call may also be used to fix the destination for future packets (in which case the .Xr read 2 or .Xr recv 2 and .Xr write 2 or .Xr send 2 system calls may be used). .Pp If .Fa proto is 0, the default protocol .Dv IPPROTO_RAW is used for outgoing packets, and only incoming packets destined for that protocol are received. If .Fa proto is non-zero, that protocol number will be used on outgoing packets and to filter incoming packets. .Pp Outgoing packets automatically have an .Tn IP header prepended to them (based on the destination address and the protocol number the socket is created with), unless the .Dv IP_HDRINCL option has been set. Incoming packets are received with .Tn IP header and options intact. .Pp .Dv IP_HDRINCL indicates the complete IP header is included with the data and may be used only with the .Dv SOCK_RAW type. .Bd -literal #include #include int hincl = 1; /* 1 = on, 0 = off */ setsockopt(s, IPPROTO_IP, IP_HDRINCL, &hincl, sizeof(hincl)); .Ed .Pp Unlike previous .Bx releases, the program must set all the fields of the IP header, including the following: .Bd -literal ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = 0; /* 0 means kernel set appropriate value */ ip->ip_off = offset; .Ed .Pp The .Va ip_len and .Va ip_off fields .Em must be provided in host byte order . All other fields must be provided in network byte order. See .Xr byteorder 3 for more information on network byte order. If the .Va ip_id field is set to 0 then the kernel will choose an appropriate value. If the header source address is set to .Dv INADDR_ANY , the kernel will choose an appropriate address. .Sh ERRORS A socket operation may fail with one of the following errors returned: .Bl -tag -width Er .It Bq Er EISCONN when trying to establish a connection on a socket which already has one, or when trying to send a datagram with the destination address specified and the socket is already connected; .It Bq Er ENOTCONN when trying to send a datagram, but no destination address is specified, and the socket has not been connected; .It Bq Er ENOBUFS when the system runs out of memory for an internal data structure; .It Bq Er EADDRNOTAVAIL when an attempt is made to create a socket with a network address for which no network interface exists. .It Bq Er EACCES when an attempt is made to create a raw IP socket by a non-privileged process. .El .Pp The following errors specific to .Tn IP may occur when setting or getting .Tn IP options: .Bl -tag -width Er .It Bq Er EINVAL An unknown socket option name was given. .It Bq Er EINVAL The IP option field was improperly formed; an option field was shorter than the minimum value or longer than the option buffer provided. .El .Pp The following errors may occur when attempting to send .Tn IP datagrams via a .Dq raw socket with the .Dv IP_HDRINCL option set: .Bl -tag -width Er .It Bq Er EINVAL The user-supplied .Va ip_len field was not equal to the length of the datagram written to the socket. .El .Sh SEE ALSO .Xr getsockopt 2 , .Xr recv 2 , .Xr send 2 , .Xr byteorder 3 , .Xr icmp 4 , .Xr inet 4 , .Xr intro 4 .Sh HISTORY The .Nm protocol appeared in .Bx 4.2 . Index: stable/6/sys/netinet/in.h =================================================================== --- stable/6/sys/netinet/in.h (revision 150827) +++ stable/6/sys/netinet/in.h (revision 150828) @@ -1,593 +1,594 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.h 8.3 (Berkeley) 1/3/94 * $FreeBSD$ */ #ifndef _NETINET_IN_H_ #define _NETINET_IN_H_ #include #include #include /* Protocols common to RFC 1700, POSIX, and X/Open. */ #define IPPROTO_IP 0 /* dummy for IP */ #define IPPROTO_ICMP 1 /* control message protocol */ #define IPPROTO_TCP 6 /* tcp */ #define IPPROTO_UDP 17 /* user datagram protocol */ #define INADDR_ANY (u_int32_t)0x00000000 #define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ #ifndef _UINT8_T_DECLARED typedef __uint8_t uint8_t; #define _UINT8_T_DECLARED #endif #ifndef _UINT16_T_DECLARED typedef __uint16_t uint16_t; #define _UINT16_T_DECLARED #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _IN_ADDR_T_DECLARED typedef uint32_t in_addr_t; #define _IN_ADDR_T_DECLARED #endif #ifndef _IN_PORT_T_DECLARED typedef uint16_t in_port_t; #define _IN_PORT_T_DECLARED #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif /* Internet address (a structure for historical reasons). */ #ifndef _STRUCT_IN_ADDR_DECLARED struct in_addr { in_addr_t s_addr; }; #define _STRUCT_IN_ADDR_DECLARED #endif /* Socket address, internet style. */ struct sockaddr_in { uint8_t sin_len; sa_family_t sin_family; in_port_t sin_port; struct in_addr sin_addr; char sin_zero[8]; }; #ifndef _KERNEL #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS uint32_t htonl(uint32_t); uint16_t htons(uint16_t); uint32_t ntohl(uint32_t); uint16_t ntohs(uint16_t); __END_DECLS #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif #endif /* !_KERNEL */ #if __POSIX_VISIBLE >= 200112 #define IPPROTO_RAW 255 /* raw IP packet */ #define INET_ADDRSTRLEN 16 #endif #if __BSD_VISIBLE /* * Constants and structures defined by the internet system, * Per RFC 790, September 1981, and numerous additions. */ /* * Protocols (RFC 1700) */ #define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ #define IPPROTO_IGMP 2 /* group mgmt protocol */ #define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ #define IPPROTO_IPV4 4 /* IPv4 encapsulation */ #define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ #define IPPROTO_ST 7 /* Stream protocol II */ #define IPPROTO_EGP 8 /* exterior gateway protocol */ #define IPPROTO_PIGP 9 /* private interior gateway */ #define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ #define IPPROTO_NVPII 11 /* network voice protocol*/ #define IPPROTO_PUP 12 /* pup */ #define IPPROTO_ARGUS 13 /* Argus */ #define IPPROTO_EMCON 14 /* EMCON */ #define IPPROTO_XNET 15 /* Cross Net Debugger */ #define IPPROTO_CHAOS 16 /* Chaos*/ #define IPPROTO_MUX 18 /* Multiplexing */ #define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ #define IPPROTO_HMP 20 /* Host Monitoring */ #define IPPROTO_PRM 21 /* Packet Radio Measurement */ #define IPPROTO_IDP 22 /* xns idp */ #define IPPROTO_TRUNK1 23 /* Trunk-1 */ #define IPPROTO_TRUNK2 24 /* Trunk-2 */ #define IPPROTO_LEAF1 25 /* Leaf-1 */ #define IPPROTO_LEAF2 26 /* Leaf-2 */ #define IPPROTO_RDP 27 /* Reliable Data */ #define IPPROTO_IRTP 28 /* Reliable Transaction */ #define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ #define IPPROTO_BLT 30 /* Bulk Data Transfer */ #define IPPROTO_NSP 31 /* Network Services */ #define IPPROTO_INP 32 /* Merit Internodal */ #define IPPROTO_SEP 33 /* Sequential Exchange */ #define IPPROTO_3PC 34 /* Third Party Connect */ #define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ #define IPPROTO_XTP 36 /* XTP */ #define IPPROTO_DDP 37 /* Datagram Delivery */ #define IPPROTO_CMTP 38 /* Control Message Transport */ #define IPPROTO_TPXX 39 /* TP++ Transport */ #define IPPROTO_IL 40 /* IL transport protocol */ #define IPPROTO_IPV6 41 /* IP6 header */ #define IPPROTO_SDRP 42 /* Source Demand Routing */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ #define IPPROTO_IDRP 45 /* InterDomain Routing*/ #define IPPROTO_RSVP 46 /* resource reservation */ #define IPPROTO_GRE 47 /* General Routing Encap. */ #define IPPROTO_MHRP 48 /* Mobile Host Routing */ #define IPPROTO_BHA 49 /* BHA */ #define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ #define IPPROTO_AH 51 /* IP6 Auth Header */ #define IPPROTO_INLSP 52 /* Integ. Net Layer Security */ #define IPPROTO_SWIPE 53 /* IP with encryption */ #define IPPROTO_NHRP 54 /* Next Hop Resolution */ #define IPPROTO_MOBILE 55 /* IP Mobility */ #define IPPROTO_TLSP 56 /* Transport Layer Security */ #define IPPROTO_SKIP 57 /* SKIP */ #define IPPROTO_ICMPV6 58 /* ICMP6 */ #define IPPROTO_NONE 59 /* IP6 no next header */ #define IPPROTO_DSTOPTS 60 /* IP6 destination option */ #define IPPROTO_AHIP 61 /* any host internal protocol */ #define IPPROTO_CFTP 62 /* CFTP */ #define IPPROTO_HELLO 63 /* "hello" routing protocol */ #define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ #define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ #define IPPROTO_RVD 66 /* Remote Virtual Disk */ #define IPPROTO_IPPC 67 /* Pluribus Packet Core */ #define IPPROTO_ADFS 68 /* Any distributed FS */ #define IPPROTO_SATMON 69 /* Satnet Monitoring */ #define IPPROTO_VISA 70 /* VISA Protocol */ #define IPPROTO_IPCV 71 /* Packet Core Utility */ #define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ #define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */ #define IPPROTO_WSN 74 /* Wang Span Network */ #define IPPROTO_PVP 75 /* Packet Video Protocol */ #define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ #define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ #define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ #define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ #define IPPROTO_EON 80 /* ISO cnlp */ #define IPPROTO_VMTP 81 /* VMTP */ #define IPPROTO_SVMTP 82 /* Secure VMTP */ #define IPPROTO_VINES 83 /* Banyon VINES */ #define IPPROTO_TTP 84 /* TTP */ #define IPPROTO_IGP 85 /* NSFNET-IGP */ #define IPPROTO_DGP 86 /* dissimilar gateway prot. */ #define IPPROTO_TCF 87 /* TCF */ #define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_SRPC 90 /* Strite RPC protocol */ #define IPPROTO_LARP 91 /* Locus Address Resoloution */ #define IPPROTO_MTP 92 /* Multicast Transport */ #define IPPROTO_AX25 93 /* AX.25 Frames */ #define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ #define IPPROTO_MICP 95 /* Mobile Int.ing control */ #define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ #define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion, no longer used */ #define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */ #define IPPROTO_MAX 256 /* last return value of *_input(), meaning "all job for this pkt is done". */ #define IPPROTO_DONE 257 /* Only used internally, so can be outside the range of valid IP protocols. */ #define IPPROTO_DIVERT 258 /* divert pseudo-protocol */ /* * Defined to avoid confusion. The master value is defined by * PROTO_SPACER in sys/protosw.h. */ #define IPPROTO_SPACER 32767 /* spacer for loadable protos */ /* * Local port number conventions: * * When a user does a bind(2) or connect(2) with a port number of zero, * a non-conflicting local port address is chosen. * The default range is IPPORT_HIFIRSTAUTO through * IPPORT_HILASTAUTO, although that is settable by sysctl. * * A user may set the IPPROTO_IP option IP_PORTRANGE to change this * default assignment range. * * The value IP_PORTRANGE_DEFAULT causes the default behavior. * * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers * into the "high" range. These are reserved for client outbound connections * which do not want to be filtered by any firewalls. Note that by default * this is the same as IP_PORTRANGE_DEFAULT. * * The value IP_PORTRANGE_LOW changes the range to the "low" are * that is (by convention) restricted to privileged processes. This * convention is based on "vouchsafe" principles only. It is only secure * if you trust the remote host to restrict these ports. * * The default range of ports and the high range can be changed by * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto) * * Changing those values has bad security implications if you are * using a stateless firewall that is allowing packets outside of that * range in order to allow transparent outgoing connections. * * Such a firewall configuration will generally depend on the use of these * default values. If you change them, you may find your Security * Administrator looking for you with a heavy object. * * For a slightly more orthodox text view on this: * * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers * * port numbers are divided into three ranges: * * 0 - 1023 Well Known Ports * 1024 - 49151 Registered Ports * 49152 - 65535 Dynamic and/or Private Ports * */ /* * Ports < IPPORT_RESERVED are reserved for * privileged processes (e.g. root). (IP_PORTRANGE_LOW) */ #define IPPORT_RESERVED 1024 /* * Default local port range, used by both IP_PORTRANGE_DEFAULT * and IP_PORTRANGE_HIGH. */ #define IPPORT_HIFIRSTAUTO 49152 #define IPPORT_HILASTAUTO 65535 /* * Scanning for a free reserved port return a value below IPPORT_RESERVED, * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was * 512, but that conflicts with some well-known-services that firewalls may * have a fit if we use. */ #define IPPORT_RESERVEDSTART 600 #define IPPORT_MAX 65535 /* * Definitions of bits in internet address integers. * On subnets, the decomposition of addresses to host and net parts * is done according to subnet mask, not the masks here. */ #define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) #define IN_CLASSA_NET 0xff000000 #define IN_CLASSA_NSHIFT 24 #define IN_CLASSA_HOST 0x00ffffff #define IN_CLASSA_MAX 128 #define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000) #define IN_CLASSB_NET 0xffff0000 #define IN_CLASSB_NSHIFT 16 #define IN_CLASSB_HOST 0x0000ffff #define IN_CLASSB_MAX 65536 #define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000) #define IN_CLASSC_NET 0xffffff00 #define IN_CLASSC_NSHIFT 8 #define IN_CLASSC_HOST 0x000000ff #define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000) #define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ #define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ #define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ #define IN_MULTICAST(i) IN_CLASSD(i) #define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) #define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) #define INADDR_LOOPBACK (u_int32_t)0x7f000001 #ifndef _KERNEL #define INADDR_NONE 0xffffffff /* -1 return */ #endif #define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ #define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */ #define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */ #define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ #define IN_LOOPBACKNET 127 /* official! */ /* * Options for use with [gs]etsockopt at the IP level. * First word of comment is data type; bool is stored in int. */ #define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ #define IP_HDRINCL 2 /* int; header is included with data */ #define IP_TOS 3 /* int; IP type of service and preced. */ #define IP_TTL 4 /* int; IP time to live */ #define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ #define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ #define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ #define IP_SENDSRCADDR IP_RECVDSTADDR /* cmsg_type to set src addr */ #define IP_RETOPTS 8 /* ip_opts; set/get IP options */ #define IP_MULTICAST_IF 9 /* u_char; set/get IP multicast i/f */ #define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ #define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ #define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ #define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ #define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ #define IP_RSVP_ON 15 /* enable RSVP in kernel */ #define IP_RSVP_OFF 16 /* disable RSVP in kernel */ #define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ #define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ #define IP_PORTRANGE 19 /* int; range to choose for unspec port */ #define IP_RECVIF 20 /* bool; receive reception if w/dgram */ /* for IPSEC */ #define IP_IPSEC_POLICY 21 /* int; set/get security policy */ #define IP_FAITH 22 /* bool; accept FAITH'ed connections */ #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_FW_TABLE_ADD 40 /* add entry */ #define IP_FW_TABLE_DEL 41 /* delete entry */ #define IP_FW_TABLE_FLUSH 42 /* flush table */ #define IP_FW_TABLE_GETSIZE 43 /* get table size */ #define IP_FW_TABLE_LIST 44 /* list table contents */ #define IP_FW_ADD 50 /* add a firewall rule to chain */ #define IP_FW_DEL 51 /* delete a firewall rule from chain */ #define IP_FW_FLUSH 52 /* flush firewall rule chain */ #define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ #define IP_FW_GET 54 /* get entire firewall rule chain */ #define IP_FW_RESETLOG 55 /* reset logging counters */ #define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ #define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ #define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ #define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ #define IP_RECVTTL 65 /* bool; receive IP TTL w/dgram */ #define IP_MINTTL 66 /* minimum TTL for packet or drop */ +#define IP_DONTFRAG 67 /* don't fragment packet */ /* * Defaults and limits for options */ #define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ #define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ #define IP_MAX_MEMBERSHIPS 20 /* per socket */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. */ struct ip_mreq { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_interface; /* local IP address of interface */ }; /* * Argument for IP_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() */ #define IP_PORTRANGE_DEFAULT 0 /* default range */ #define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ /* * Definitions for inet sysctl operations. * * Third level is protocol number. * Fourth level is desired variable within that protocol. */ #define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */ #define CTL_IPPROTO_NAMES { \ { "ip", CTLTYPE_NODE }, \ { "icmp", CTLTYPE_NODE }, \ { "igmp", CTLTYPE_NODE }, \ { "ggp", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { "tcp", CTLTYPE_NODE }, \ { 0, 0 }, \ { "egp", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "pup", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "udp", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "idp", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "ipsec", CTLTYPE_NODE }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { 0, 0 }, \ { "pim", CTLTYPE_NODE }, \ } /* * Names for IP sysctl objects */ #define IPCTL_FORWARDING 1 /* act as router */ #define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ #define IPCTL_DEFTTL 3 /* default TTL */ #ifdef notyet #define IPCTL_DEFMTU 4 /* default MTU */ #endif #define IPCTL_RTEXPIRE 5 /* cloned route expiration time */ #define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */ #define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */ #define IPCTL_SOURCEROUTE 8 /* may perform source routes */ #define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ #define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ #define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ #define IPCTL_STATS 12 /* ipstat structure */ #define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ #define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ #define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */ #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ #define IPCTL_MAXID 17 #define IPCTL_NAMES { \ { 0, 0 }, \ { "forwarding", CTLTYPE_INT }, \ { "redirect", CTLTYPE_INT }, \ { "ttl", CTLTYPE_INT }, \ { "mtu", CTLTYPE_INT }, \ { "rtexpire", CTLTYPE_INT }, \ { "rtminexpire", CTLTYPE_INT }, \ { "rtmaxcache", CTLTYPE_INT }, \ { "sourceroute", CTLTYPE_INT }, \ { "directed-broadcast", CTLTYPE_INT }, \ { "intr-queue-maxlen", CTLTYPE_INT }, \ { "intr-queue-drops", CTLTYPE_INT }, \ { "stats", CTLTYPE_STRUCT }, \ { "accept_sourceroute", CTLTYPE_INT }, \ { "fastforwarding", CTLTYPE_INT }, \ } #endif /* __BSD_VISIBLE */ #ifdef _KERNEL struct ifnet; struct mbuf; /* forward declarations for Standard C */ int in_broadcast(struct in_addr, struct ifnet *); int in_canforward(struct in_addr); int in_localaddr(struct in_addr); int in_localip(struct in_addr); char *inet_ntoa(struct in_addr); /* in libkern */ char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ void in_ifdetach(struct ifnet *); #define in_hosteq(s, t) ((s).s_addr == (t).s_addr) #define in_nullhost(x) ((x).s_addr == INADDR_ANY) #define satosin(sa) ((struct sockaddr_in *)(sa)) #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) #endif /* _KERNEL */ /* INET6 stuff */ #if __POSIX_VISIBLE >= 200112 #define __KAME_NETINET_IN_H_INCLUDED_ #include #undef __KAME_NETINET_IN_H_INCLUDED_ #endif #endif /* !_NETINET_IN_H_*/ Index: stable/6/sys/netinet/in_pcb.h =================================================================== --- stable/6/sys/netinet/in_pcb.h (revision 150827) +++ stable/6/sys/netinet/in_pcb.h (revision 150828) @@ -1,370 +1,371 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ struct inpcbpolicy; /* * Common structure pcb for internet protocol implementation. * Here are stored pointers to local and foreign host table * entries, local and foreign socket numbers, and pointers * up (to a socket structure) and down (to a protocol-specific) * control block. */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, * by utilize following structure. (At last, same as INRIA) */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. * in_conninfo has some extra padding to accomplish this. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union { /* foreign host table entry */ struct in_addr_4in6 ie46_foreign; struct in6_addr ie6_foreign; } ie_dependfaddr; union { /* local host table entry */ struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 #define ie6_faddr ie_dependfaddr.ie6_foreign #define ie6_laddr ie_dependladdr.ie6_local }; /* * XXX * the defines for inc_* are hacks and should be changed to direct references */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_pad; /* XXX alignment for in_endpoints */ /* protocol dependent part */ struct in_endpoints inc_ie; }; #define inc_isipv6 inc_flags /* temp compatability */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr struct icmp6_filter; struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ u_int32_t inp_flow; /* local and foreign ports, local and foreign addr */ struct in_conninfo inp_inc; caddr_t inp_ppcb; /* pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* PCB list info */ struct socket *inp_socket; /* back pointer to socket */ /* list for this PCB's local port */ struct label *inp_label; /* MAC label */ int inp_flags; /* generic IP/datagram flags */ struct inpcbpolicy *inp_sp; /* for IPSEC */ u_char inp_vflag; /* IP version flag (v4/v6) */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ #define INP_TIMEWAIT 0x8 /* .. probably doesn't go here */ #define INP_ONESBCAST 0x10 /* send all-ones broadcast */ u_char inp_ip_ttl; /* time to live proto */ u_char inp_ip_p; /* protocol proto */ u_char inp_ip_minttl; /* minimum TTL or drop */ /* protocol dependent part; options */ struct { u_char inp4_ip_tos; /* type of service proto */ struct mbuf *inp4_options; /* IP options */ struct ip_moptions *inp4_moptions; /* IP multicast options */ } inp_depend4; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions struct { /* IP options */ struct mbuf *inp6_options; /* IP6 options for outgoing packets */ struct ip6_pktopts *inp6_outputopts; /* IP multicast options */ struct ip6_moptions *inp6_moptions; /* ICMPv6 code type filter */ struct icmp6_filter *inp6_icmp6filt; /* IPV6_CHECKSUM setsockopt */ int inp6_cksum; u_short inp6_ifindex; short inp6_hops; } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; struct inpcbport *inp_phd; /* head of this list */ inp_gen_t inp_gencnt; /* generation count of this instance */ struct mtx inp_mtx; #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_ip6_hlim inp_depend6.inp6_hlim #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_ip6_nxt inp_ip_p #define in6p_flowinfo inp_flow #define in6p_vflag inp_vflag #define in6p_options inp_depend6.inp6_options #define in6p_outputopts inp_depend6.inp6_outputopts #define in6p_moptions inp_depend6.inp6_moptions #define in6p_icmp6filt inp_depend6.inp6_icmp6filt #define in6p_cksum inp_depend6.inp6_cksum #define inp6_ifindex inp_depend6.inp6_ifindex #define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ #define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ }; /* * The range of the generation count, as used in this implementation, * is 9e19. We would have to create 300 billion connections per * second for this number to roll over in a year. This seems sufficiently * unlikely that we simply don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use * inpcbs. Hack alert -- only define if struct xsocket is in scope. */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct inpcb xi_inp; struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; struct inpcbinfo { /* XXX documentation, prefixes */ struct inpcbhead *hashbase; u_long hashmask; struct inpcbporthead *porthashbase; u_long porthashmask; struct inpcbhead *listhead; u_short lastport; u_short lastlow; u_short lasthi; struct uma_zone *ipi_zone; /* zone to allocate pcbs from */ u_int ipi_count; /* number of pcbs in this list */ u_quad_t ipi_gencnt; /* current generation count */ struct mtx ipi_mtx; }; /* * NB: We cannot enable assertions when IPv6 is configured as * this code is shared by both IPv4 and IPv6 and IPv6 is * not properly locked. */ #define INP_LOCK_INIT(inp, d, t) \ mtx_init(&(inp)->inp_mtx, (d), (t), MTX_DEF | MTX_RECURSE | MTX_DUPOK) #define INP_LOCK_DESTROY(inp) mtx_destroy(&(inp)->inp_mtx) #define INP_LOCK(inp) mtx_lock(&(inp)->inp_mtx) #define INP_UNLOCK(inp) mtx_unlock(&(inp)->inp_mtx) #define INP_LOCK_ASSERT(inp) do { \ mtx_assert(&(inp)->inp_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define INP_UNLOCK_ASSERT(inp) mtx_assert(&(inp)->inp_mtx, MA_NOTOWNED) #define INP_INFO_LOCK_INIT(ipi, d) \ mtx_init(&(ipi)->ipi_mtx, (d), NULL, MTX_DEF | MTX_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_mtx) #define INP_INFO_RLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) #define INP_INFO_RUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) #define INP_INFO_RLOCK_ASSERT(ipi) do { \ mtx_assert(&(ipi)->ipi_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define INP_INFO_WLOCK_ASSERT(ipi) do { \ mtx_assert(&(ipi)->ipi_mtx, MA_OWNED); \ NET_ASSERT_GIANT(); \ } while (0) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) /* flags in inp_flags: */ #define INP_RECVOPTS 0x01 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x04 /* receive IP dst address */ #define INP_HDRINCL 0x08 /* user supplies entire IP header */ #define INP_HIGHPORT 0x10 /* user wants "high" port binding */ #define INP_LOWPORT 0x20 /* user wants "low" port binding */ #define INP_ANONPORT 0x40 /* port chosen for user */ #define INP_RECVIF 0x80 /* receive incoming interface */ #define INP_MTUDISC 0x100 /* user can do MTU discovery */ #define INP_FAITH 0x200 /* accept FAITH'ed connections */ #define INP_RECVTTL 0x400 /* receive incoming IP TTL */ +#define INP_DONTFRAG 0x800 /* don't fragment packet */ #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL) /* for KAME src sync over BSD*'s */ #define IN6P_HIGHPORT INP_HIGHPORT #define IN6P_LOWPORT INP_LOWPORT #define IN6P_ANONPORT INP_ANONPORT #define IN6P_RECVIF INP_RECVIF #define IN6P_MTUDISC INP_MTUDISC #define IN6P_FAITH INP_FAITH #define IN6P_CONTROLOPTS INP_CONTROLOPTS /* * socket AF version is {newer than,or include} * actual datagram AF version */ #define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL extern int ipport_lowfirstauto; extern int ipport_lowlastauto; extern int ipport_firstauto; extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *, const char *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); int in_pcbinshash(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); struct inpcb * in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbrehash(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_setpeeraddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo); int in_setsockaddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); void in_pcbremlists(struct inpcb *inp); void ipport_tick(void *xtp); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: stable/6/sys/netinet/ip_output.c =================================================================== --- stable/6/sys/netinet/ip_output.c (revision 150827) +++ stable/6/sys/netinet/ip_output.c (revision 150828) @@ -1,2062 +1,2070 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); #ifdef IPSEC #include #include #ifdef IPSEC_DEBUG #include #else #define KEYDEBUG(lev,arg) #endif #endif /*IPSEC*/ #ifdef FAST_IPSEC #include #include #include #endif /*FAST_IPSEC*/ #define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ x, (ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); u_short ip_id; #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); static int ip_getmoptions(struct inpcb *, struct sockopt *); static int ip_pcbopts(struct inpcb *, int, struct mbuf *); static int ip_setmoptions(struct inpcb *, struct sockopt *); static struct ip_moptions *ip_findmoptions(struct inpcb *inp); int ip_optcopy(struct ip *, struct ip *); extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int len, error = 0; struct sockaddr_in *dst = NULL; /* keep compiler happy */ struct in_ifaddr *ia = NULL; int isbroadcast, sw_csum; struct route iproute; struct in_addr odst; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; #endif #ifdef IPSEC struct secpolicy *sp = NULL; #endif #ifdef FAST_IPSEC struct secpolicy *sp = NULL; struct tdb_ident *tdbi; struct m_tag *mtag; int s; #endif /* FAST_IPSEC */ M_ASSERTPKTHDR(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } if (inp != NULL) INP_LOCK_ASSERT(inp); if (opt) { len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; } ip = mtod(m, struct ip *); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } dst = (struct sockaddr_in *)&ro->ro_dst; again: /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. */ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)0; } #ifdef IPFIREWALL_FORWARD if (ro->ro_rt == NULL && fwd_tag == NULL) { #else if (ro->ro_rt == NULL) { #endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, * short circuit routing lookup. */ if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (ro->ro_rt == NULL) rtalloc_ign(ro, 0); if (ro->ro_rt == NULL) { ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(dst->sin_addr, ifp); } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "dst" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ dst = (struct sockaddr_in *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } IN_MULTI_LOCK(); IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { IN_MULTI_UNLOCK(); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ ip_mloopback(ifp, m, dst, hlen); } else { IN_MULTI_UNLOCK(); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } #ifndef notdef /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } #endif /* notdef */ /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ #ifdef ALTQ if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen)) #else if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) #endif /* ALTQ */ { error = ENOBUFS; ipstat.ips_odropped++; goto bad; } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip->ip_len > ifp->if_mtu) { error = EMSGSIZE; goto bad; } if (flags & IP_SENDONES) ip->ip_dst.s_addr = INADDR_BROADCAST; m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC /* get SP for this packet */ if (inp == NULL) sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); else sp = ipsec4_getpolicybypcb(m, IPSEC_DIR_OUTBOUND, inp, &error); if (sp == NULL) { ipsecstat.out_inval++; goto bad; } error = 0; /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: /* * This packet is just discarded. */ ipsecstat.out_polvio++; goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: case IPSEC_POLICY_TCP: /* no need to do IPsec. */ goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); goto bad; } break; case IPSEC_POLICY_ENTRUST: default: printf("ip_output: Invalid policy found. %d\n", sp->policy); } { struct ipsec_output_state state; bzero(&state, sizeof(state)); state.m = m; if (flags & IP_ROUTETOIF) { state.ro = &iproute; bzero(&iproute, sizeof(iproute)); } else state.ro = ro; state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; /* * XXX * delayed checksums are not currently compatible with IPsec */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); error = ipsec4_output(&state, sp, flags); m = state.m; if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ if (state.ro != &iproute || state.ro->ro_rt != NULL) { flags &= ~IP_ROUTETOIF; ro = state.ro; } } else ro = state.ro; dst = (struct sockaddr_in *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. */ m = NULL; switch (error) { case EHOSTUNREACH: case ENETUNREACH: case EMSGSIZE: case ENOBUFS: case ENOMEM: break; default: printf("ip4_output (ipsec): error code %d\n", error); /*fall through*/ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } goto bad; } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { printf("ip_output: " "can't update route after IPsec processing\n"); error = EHOSTUNREACH; /*XXX*/ goto bad; } } else { if (state.encap) { ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; } } } /* make it flipped, again. */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); skip_ipsec: #endif /*IPSEC*/ #ifdef FAST_IPSEC /* * Check the security policy (SP) for the packet and, if * required, do IPsec-related processing. There are two * cases here; the first time a packet is sent through * it will be untagged and handled by ipsec4_checkpolicy. * If the packet is resubmitted to ip_output (e.g. after * AH, ESP, etc. processing), there will be a tag to bypass * the lookup and related policy checking. */ mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); s = splnet(); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); if (sp == NULL) error = -EINVAL; /* force silent drop */ m_tag_delete(m, mtag); } else { sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags, &error, inp); } /* * There are four return cases: * sp != NULL apply IPsec policy * sp == NULL, error == 0 no IPsec handling needed * sp == NULL, error == -EINVAL discard packet w/o error * sp == NULL, error != 0 discard packet, report error */ if (sp != NULL) { /* Loop detection, check if ipsec processing already done */ KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) continue; if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) continue; /* * Check if policy has an SA associated with it. * This can happen when an SP has yet to acquire * an SA; e.g. on first reference. If it occurs, * then we let ipsec4_process_packet do its thing. */ if (sp->req->sav == NULL) break; tdbi = (struct tdb_ident *)(mtag + 1); if (tdbi->spi == sp->req->sav->spi && tdbi->proto == sp->req->sav->sah->saidx.proto && bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, sizeof (union sockaddr_union)) == 0) { /* * No IPsec processing is needed, free * reference to SP. * * NB: null pointer to avoid free at * done: below. */ KEY_FREESP(&sp), sp = NULL; splx(s); goto spd_done; } } /* * Do delayed checksums now because we send before * this is done in the normal processing path. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); /* NB: callee frees mbuf */ error = ipsec4_process_packet(m, sp->req, flags, 0); /* * Preserve KAME behaviour: ENOENT can be returned * when an SA acquire is in progress. Don't propagate * this to user-level; it confuses applications. * * XXX this will go away when the SADB is redone. */ if (error == ENOENT) error = 0; splx(s); goto done; } else { splx(s); if (error != 0) { /* * Hack: -EINVAL is used to signal that a packet * should be silently discarded. This is typically * because we asked key management for an SA and * it was delayed (e.g. kicked up to IKE). */ if (error == -EINVAL) error = 0; goto bad; } else { /* No IPsec processing for this packet. */ } #ifdef notyet /* * If deferred crypto processing is needed, check that * the interface supports it. */ mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL); if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) { /* notify IPsec to do its own crypto */ ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); error = EHOSTUNREACH; goto bad; } #endif } spd_done: #endif /* FAST_IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (inet_pfil_hook.ph_busy_count == -1) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } else goto again; /* Redo the routing table lookup. */ } #ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag) { #ifndef IPFIREWALL_FORWARD_EXTENDED if (!in_localip(ip->ip_src) && !in_localaddr(ip->ip_dst)) { #endif dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m_tag_delete(m, fwd_tag); goto again; #ifndef IPFIREWALL_FORWARD_EXTENDED } else { m_tag_delete(m, fwd_tag); /* Continue. */ } #endif } #endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; } m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && ((ip->ip_off & IP_DF) == 0))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m, hlen); /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); #endif error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); goto done; } if (ip->ip_off & IP_DF) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } ipstat.ips_cantfrag++; goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { ia->ia_ifa.if_opackets++; ia->ia_ifa.if_obytes += m->m_pkthdr.len; } error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro->ro_rt); } else m_freem(m); } if (error == 0) ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { RTFREE(ro->ro_rt); } #ifdef IPSEC if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP ip_output call free SP:%p\n", sp)); key_freesp(sp); } #endif #ifdef FAST_IPSEC if (sp != NULL) KEY_FREESP(&sp); #endif return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ ipstat.ips_cantfrag++; return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copy(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip->ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; if (off + len >= ip->ip_len) { /* last fragment */ len = ip->ip_len - off; m->m_flags |= M_LASTFRAG; } else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_create_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) mhip->ip_sum = in_cksum(m, mhlen); *mnext = m; mnext = &m->m_nextpkt; } ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip->ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off |= IP_MF; ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) ip->ip_sum = in_cksum(m0, hlen); done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; u_short csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; csum = in_cksum_skip(m, ip->ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("delayed m_pullup, m->len: %d off: %d p: %d\n", m->m_len, offset, ip->ip_p); /* * XXX * this shouldn't happen, but if it does, the * correct behavior may be to insert the checksum * in the existing chain instead of rearranging it. */ m = m_pullup(m, offset + sizeof(u_short)); } *(u_short *)(m->m_data + offset) = csum; } /* * Insert IP options into preformed packet. * Adjust IP destination as required for IP source routing, * as indicated by a non-zero in_addr at the start of the options. * * XXX This routine assumes that the packet has no options in place. */ static struct mbuf * ip_insertoptions(m, opt, phlen) register struct mbuf *m; struct mbuf *opt; int *phlen; { register struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; register struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ip->ip_len > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { MGETHDR(n, M_DONTWAIT, MT_HEADER); if (n == NULL) { *phlen = 0; return (m); } M_MOVE_PKTHDR(n, m); n->m_pkthdr.rcvif = NULL; #ifdef MAC mac_copy_mbuf(m, n); #endif n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len += optlen; return (m); } /* * Copy options from ip to jp, * omitting those not copied during fragmentation. */ int ip_optcopy(ip, jp) struct ip *ip, *jp; { register u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* bogus lengths should have been caught by ip_dooptions */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * IP socket option processing. */ int ip_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { return (EINVAL); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); INP_LOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_UNLOCK(inp); return (error); } case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: + case IP_DONTFRAG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval > 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_LOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_UNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; + case IP_DONTFRAG: + OPTSET(INP_DONTFRAG); + break; } break; #undef OPTSET case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_LOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_UNLOCK(inp); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { caddr_t req; size_t len = 0; int priv; struct mbuf *m; int optname; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; priv = (sopt->sopt_td != NULL && suser(sopt->sopt_td) != 0) ? 0 : 1; req = mtod(m, caddr_t); len = m->m_len; optname = sopt->sopt_name; error = ipsec4_set_policy(inp, optname, req, len, priv); m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: + case IP_DONTFRAG: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); + break; + case IP_DONTFRAG: + optval = OPTBIT(INP_DONTFRAG); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: error = ip_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(FAST_IPSEC) case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Set up IP options in pcb for insertion in output packets. * Store in mbuf with pointer in pcbopt, adding pseudo-option * with destination address if source routed. */ static int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { register int cnt, optlen; register u_char *cp; struct mbuf **pcbopt; u_char opt; INP_LOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = 0; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before * actual options; move other options back * and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * user process specifies route as: * ->A->B->C->D * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear in * actual IP option, but is stored before the options. */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * XXX * The whole multicast option thing needs to be re-thought. * Several of these options are equally applicable to non-multicast * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ /* * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. */ static struct ifnet * ip_multicast_if(a, ifindexp) struct in_addr *a; int *ifindexp; { int ifindex; struct ifnet *ifp; if (ifindexp) *ifindexp = 0; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; if (ifindex < 0 || if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); if (ifindexp) *ifindexp = ifindex; } else { INADDR_TO_IFP(*a, ifp); } return ifp; } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. */ static struct ip_moptions * ip_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; INP_LOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_UNLOCK(inp); imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; imo->imo_num_memberships = 0; INP_LOCK(inp); if (inp->inp_moptions != NULL) { free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Set the IP multicast options in response to user setsockopt(). */ static int ip_setmoptions(struct inpcb *inp, struct sockopt *sopt) { int error = 0; int i; struct in_addr addr; struct ip_mreq mreq; struct ifnet *ifp; struct ip_moptions *imo; struct route ro; struct sockaddr_in *dst; int ifindex; int s; switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ case IP_MULTICAST_VIF: if (legal_vif_num == 0) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (error) break; if (!legal_vif_num(i) && (i != -1)) { error = EINVAL; break; } imo = ip_findmoptions(inp); imo->imo_multicast_vif = i; INP_UNLOCK(inp); break; case IP_MULTICAST_IF: /* * Select the interface for outgoing multicast packets. */ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); if (error) break; /* * INADDR_ANY is used to remove a previous selection. * When no interface is selected, a default one is * chosen every time a multicast packet is sent. */ imo = ip_findmoptions(inp); if (addr.s_addr == INADDR_ANY) { imo->imo_multicast_ifp = NULL; INP_UNLOCK(inp); break; } /* * The selected interface is identified by its local * IP address. Find the interface and confirm that * it supports multicasting. */ s = splimp(); ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { INP_UNLOCK(inp); splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; if (ifindex) imo->imo_multicast_addr = addr; else imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_UNLOCK(inp); splx(s); break; case IP_MULTICAST_TTL: /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char ttl; error = sooptcopyin(sopt, &ttl, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } else { u_int ttl; error = sooptcopyin(sopt, &ttl, sizeof ttl, sizeof ttl); if (error) break; if (ttl > 255) error = EINVAL; else { imo = ip_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_UNLOCK(inp); } } break; case IP_MULTICAST_LOOP: /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == 1) { u_char loop; error = sooptcopyin(sopt, &loop, 1, 1); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } else { u_int loop; error = sooptcopyin(sopt, &loop, sizeof loop, sizeof loop); if (error) break; imo = ip_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_UNLOCK(inp); } break; case IP_ADD_MEMBERSHIP: /* * Add a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If no interface address was provided, use the interface of * the route to the given multicast address. */ if (mreq.imr_interface.s_addr == INADDR_ANY) { bzero((caddr_t)&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); break; } ifp = ro.ro_rt->rt_ifp; RTFREE(ro.ro_rt); } else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); } /* * See if we found an interface, and confirm that it * supports multicast. */ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; splx(s); break; } /* * See if the membership already exists or if all the * membership slots are full. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if (imo->imo_membership[i]->inm_ifp == ifp && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i < imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRINUSE; splx(s); break; } if (i == IP_MAX_MEMBERSHIPS) { INP_UNLOCK(inp); error = ETOOMANYREFS; splx(s); break; } /* * Everything looks good; add a new record to the multicast * address list for the given interface. */ if ((imo->imo_membership[i] = in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { INP_UNLOCK(inp); error = ENOBUFS; splx(s); break; } ++imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; case IP_DROP_MEMBERSHIP: /* * Drop a multicast group membership. * Group must be a valid IP multicast address. */ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { error = EINVAL; break; } s = splimp(); /* * If an interface address was specified, get a pointer * to its ifnet structure. */ if (mreq.imr_interface.s_addr == INADDR_ANY) ifp = NULL; else { ifp = ip_multicast_if(&mreq.imr_interface, NULL); if (ifp == NULL) { error = EADDRNOTAVAIL; splx(s); break; } } /* * Find the membership in the membership array. */ imo = ip_findmoptions(inp); for (i = 0; i < imo->imo_num_memberships; ++i) { if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) && imo->imo_membership[i]->inm_addr.s_addr == mreq.imr_multiaddr.s_addr) break; } if (i == imo->imo_num_memberships) { INP_UNLOCK(inp); error = EADDRNOTAVAIL; splx(s); break; } /* * Give up the multicast address record to which the * membership points. */ in_delmulti(imo->imo_membership[i]); /* * Remove the gap in the membership array. */ for (++i; i < imo->imo_num_memberships; ++i) imo->imo_membership[i-1] = imo->imo_membership[i]; --imo->imo_num_memberships; INP_UNLOCK(inp); splx(s); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ static int ip_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; struct in_addr addr; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_LOCK(inp); imo = inp->inp_moptions; error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_UNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; else if (imo->imo_multicast_addr.s_addr) { /* return the value user has set */ addr = imo->imo_multicast_addr; } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; } INP_UNLOCK(inp); error = sooptcopyout(sopt, &addr, sizeof addr); break; case IP_MULTICAST_TTL: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_LOOP: if (imo == 0) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_UNLOCK(inp); if (sopt->sopt_valsize == 1) error = sooptcopyout(sopt, &coptval, 1); else error = sooptcopyout(sopt, &optval, sizeof optval); break; default: INP_UNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Discard the IP multicast options. */ void ip_freemoptions(imo) register struct ip_moptions *imo; { register int i; if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) in_delmulti(imo->imo_membership[i]); free(imo, M_IPMOPTS); } } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(ifp, m, dst, hlen) struct ifnet *ifp; register struct mbuf *m; register struct sockaddr_in *dst; int hlen; { register struct ip *ip; struct mbuf *copym; copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); /* * NB: * It's not clear whether there are any lingering * reentrancy problems in other areas which might * be exposed by using ip_input directly (in * particular, everything which modifies the packet * in-place). Yet another option is using the * protosw directly to deliver the looped back * packet. For the moment, we'll err on the side * of safety by using if_simloop(). */ #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif #ifdef notdef copym->m_pkthdr.rcvif = ifp; ip_input(copym); #else if_simloop(ifp, copym, dst->sin_family, 0); #endif } } Index: stable/6/sys/netinet/raw_ip.c =================================================================== --- stable/6/sys/netinet/raw_ip.c (revision 150827) +++ stable/6/sys/netinet/raw_ip.c (revision 150828) @@ -1,919 +1,922 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FAST_IPSEC #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #endif /*IPSEC*/ struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; ip_dn_ctl_t *ip_dn_ctl_ptr = NULL; /* * hooks for multicast routing. They all default to NULL, * so leave them not initialized and rely on BSS being set to 0. */ /* The socket used to communicate with the multicast routing daemon. */ struct socket *ip_mrouter; /* The various mrouter and rsvp functions */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(int, caddr_t); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); void (*rsvp_input_p)(struct mbuf *m, int off); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); /* * Nominal space allocated to a raw ip socket. */ #define RIPSNDQ 8192 #define RIPRCVQ 8192 /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ void rip_init() { INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); LIST_INIT(&ripcb); ripcbinfo.listhead = &ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. */ ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; static int raw_append(struct inpcb *last, struct ip *ip, struct mbuf *n) { int policyfail = 0; INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(FAST_IPSEC) /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; #ifdef IPSEC ipsecstat.in_polvio++; #endif /*IPSEC*/ /* do not inject data to pcb */ } #endif /*IPSEC || FAST_IPSEC*/ #ifdef MAC if (!policyfail && mac_check_inpcb_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & SO_TIMESTAMP)) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return policyfail; } /* * Setup generic address and protocol structures * for raw_input routine, then pass them along with * mbuf chain. */ void rip_input(struct mbuf *m, int off) { struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; INP_INFO_RLOCK(&ripcbinfo); ripsrc.sin_addr = ip->ip_src; last = NULL; LIST_FOREACH(inp, &ripcb, inp_list) { INP_LOCK(inp); if (inp->inp_ip_p && inp->inp_ip_p != proto) { docontinue: INP_UNLOCK(inp); continue; } #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) goto docontinue; #endif if (inp->inp_laddr.s_addr && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) goto docontinue; if (inp->inp_faddr.s_addr && inp->inp_faddr.s_addr != ip->ip_src.s_addr) goto docontinue; if (jailed(inp->inp_socket->so_cred)) if (htonl(prison_getip(inp->inp_socket->so_cred)) != ip->ip_dst.s_addr) goto docontinue; if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) raw_append(last, ip, n); /* XXX count dropped packet */ INP_UNLOCK(last); } last = inp; } if (last != NULL) { if (raw_append(last, ip, m) != 0) ipstat.ips_delivered--; INP_UNLOCK(last); } else { m_freem(m); ipstat.ips_noproto++; ipstat.ips_delivered--; } INP_INFO_RUNLOCK(&ripcbinfo); } /* * Generate IP header and pass packet to ip_output. * Tack on options user may have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; /* * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == NULL) return(ENOBUFS); INP_LOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; - ip->ip_off = 0; + if (inp->inp_flags & INP_DONTFRAG) + ip->ip_off = IP_DF; + else + ip->ip_off = 0; ip->ip_p = inp->inp_ip_p; ip->ip_len = m->m_pkthdr.len; if (jailed(inp->inp_socket->so_cred)) ip->ip_src.s_addr = htonl(prison_getip(inp->inp_socket->so_cred)); else ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_LOCK(inp); ip = mtod(m, struct ip *); if (jailed(inp->inp_socket->so_cred)) { if (ip->ip_src.s_addr != htonl(prison_getip(inp->inp_socket->so_cred))) { INP_UNLOCK(inp); m_freem(m); return (EPERM); } } /* don't allow both user specified and setsockopt options, and don't allow packet length sizes that will crash */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ip->ip_len > m->m_pkthdr.len) || (ip->ip_len < (ip->ip_hl << 2))) { INP_UNLOCK(inp); m_freem(m); return EINVAL; } if (ip->ip_id == 0) ip->ip_id = ip_newid(); /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_UNLOCK(inp); return error; } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking suser() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: error = suser(curthread); if (error != 0) return (error); if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_GET: error = suser(curthread); if (error != 0) return (error); if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = suser(curthread); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: error = suser(curthread); if (error != 0) return (error); if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: error = suser(curthread); if (error != 0) return (error); if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = suser(curthread); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = suser(curthread); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = suser(curthread); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = suser(curthread); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, * and calls in_ifadown() to remove all routes corresponding to that address. * It also receives the PRC_IFUP messages from if_up() and reinstalls the * interface routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* * in_ifscrub kills the interface route. */ in_ifscrub(ia->ia_ifp, ia); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); break; } } break; case PRC_IFUP: TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) return; flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; break; } } u_long rip_sendspace = RIPSNDQ; u_long rip_recvspace = RIPRCVQ; SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; /* XXX why not lower? */ INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp) { /* XXX counter, printf */ INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } if (jailed(td->td_ucred) && !jail_allow_raw_sockets) { INP_INFO_WUNLOCK(&ripcbinfo); return (EPERM); } if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL)) != 0) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } if (proto >= IPPROTO_MAX || proto < 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EPROTONOSUPPORT; } error = soreserve(so, rip_sendspace, rip_recvspace); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } error = in_pcballoc(so, &ripcbinfo, "rawinp"); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static void rip_pcbdetach(struct socket *so, struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(&ripcbinfo); INP_LOCK_ASSERT(inp); if (so == ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); } static int rip_detach(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { /* XXX counter, printf */ INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); rip_pcbdetach(so, inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_abort(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; /* ??? possible? panic instead? */ } INP_LOCK(inp); soisdisconnected(so); if (so->so_state & SS_NOFDREF) rip_pcbdetach(so, inp); else INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_disconnect(struct socket *so) { if ((so->so_state & SS_ISCONNECTED) == 0) return ENOTCONN; return rip_abort(so); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (jailed(td->td_ucred)) { if (addr->sin_addr.s_addr == INADDR_ANY) addr->sin_addr.s_addr = htonl(prison_getip(td->td_ucred)); if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr) return (EADDRNOTAVAIL); } if (TAILQ_EMPTY(&ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) return EADDRNOTAVAIL; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); inp->inp_laddr = addr->sin_addr; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; if (TAILQ_EMPTY(&ifnet)) return EADDRNOTAVAIL; if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return EAFNOSUPPORT; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); inp->inp_faddr = addr->sin_addr; soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); return 0; } static int rip_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&ripcbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&ripcbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&ripcbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; int ret; INP_INFO_WLOCK(&ripcbinfo); inp = sotoinpcb(so); if (so->so_state & SS_ISCONNECTED) { if (nam) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return EISCONN; } dst = inp->inp_faddr.s_addr; } else { if (nam == NULL) { INP_INFO_WUNLOCK(&ripcbinfo); m_freem(m); return ENOTCONN; } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } ret = rip_output(m, so, dst); INP_INFO_WUNLOCK(&ripcbinfo); return ret; } static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&ripcbinfo); gencnt = ripcbinfo.ipi_gencnt; n = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&ripcbinfo); for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) { /* XXX held references? */ inp_list[i++] = inp; } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&ripcbinfo); xig.xig_gen = ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &ripcbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int rip_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &ripcbinfo)); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = rip_peeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = rip_sockaddr, .pru_sosetlabel = in_pcbsosetlabel }; Index: stable/6/sys/netinet/udp_usrreq.c =================================================================== --- stable/6/sys/netinet/udp_usrreq.c (revision 150827) +++ stable/6/sys/netinet/udp_usrreq.c (revision 150828) @@ -1,1109 +1,1118 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 * $FreeBSD$ */ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #ifdef INET6 #include #endif #include #include #ifdef FAST_IPSEC #include #endif /*FAST_IPSEC*/ #ifdef IPSEC #include #endif /*IPSEC*/ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ #ifndef COMPAT_42 static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udpcksum, 0, ""); int log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &blackhole, 0, "Do not send port unreachables for refused connects"); static int strict_mcast_mship = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, strict_mcast_mship, CTLFLAG_RW, &strict_mcast_mship, 0, "Only send multicast to member sockets"); struct inpcbhead udb; /* from udp_var.h */ #define udb6 udb /* for KAME src sync over BSD*'s */ struct inpcbinfo udbinfo; #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif struct udpstat udpstat; /* from udp_var.h */ SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in); static int udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); void udp_init() { INP_INFO_LOCK_INIT(&udbinfo, "udp"); LIST_INIT(&udb); udbinfo.listhead = &udb; udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } void udp_input(m, off) register struct mbuf *m; int off; { int iphlen = off; register struct ip *ip; register struct udphdr *uh; register struct inpcb *inp; struct mbuf *opts = 0; int len; struct ip save_ip; struct sockaddr_in udp_in; udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, * make available to user, and use on returned packets, * but we don't yet have a way to check the checksum * with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. * Stuff source address and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. * If not enough data to reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); /* ip->ip_len = len; */ } /* * Save a copy of the IP header in case we want restore it * for sending an ICMP error message in response. */ if (!blackhole) save_ip = *ip; /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh->uh_sum = m->m_pkthdr.csum_data; else uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); uh->uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = uh->uh_ulen; uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh->uh_sum) { udpstat.udps_badsum++; m_freem(m); return; } } else udpstat.udps_nosum++; INP_INFO_RLOCK(&udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { struct inpcb *last; /* * Deliver a multicast or broadcast datagram to *all* sockets * for which the local and remote addresses and ports match * those of the incoming datagram. This allows more than * one process to receive multi/broadcasts on the same port. * (This really ought to be done for unicast datagrams as * well, but that would cause problems with existing * applications that open both address-specific sockets and * a wildcard socket listening to the same port -- they would * end up receiving duplicates of every unicast datagram. * Those applications open the multiple sockets to overcome an * inadequacy of the UDP socket interface, but for backwards * compatibility we avoid the problem here rather than * fixing the interface. Maybe 4.5BSD will remedy this?) */ /* * Locate pcb(s) for datagram. * (Algorithm copied from raw_intr().) */ last = NULL; LIST_FOREACH(inp, &udb, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY) { if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; } if (inp->inp_faddr.s_addr != INADDR_ANY) { if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || inp->inp_fport != uh->uh_sport) continue; } INP_LOCK(inp); /* * Check multicast packets to make sure they are only * sent to sockets with multicast memberships for the * packet's destination address and arrival interface */ #define MSHIP(_inp, n) ((_inp)->inp_moptions->imo_membership[(n)]) #define NMSHIPS(_inp) ((_inp)->inp_moptions->imo_num_memberships) if (strict_mcast_mship && inp->inp_moptions != NULL) { int mship, foundmship = 0; for (mship = 0; mship < NMSHIPS(inp); mship++) { if (MSHIP(inp, mship)->inm_addr.s_addr == ip->ip_dst.s_addr && MSHIP(inp, mship)->inm_ifp == m->m_pkthdr.rcvif) { foundmship = 1; break; } } if (foundmship == 0) { INP_UNLOCK(inp); continue; } } #undef NMSHIPS #undef MSHIP if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); if (n != NULL) udp_append(last, ip, n, iphlen + sizeof(struct udphdr), &udp_in); INP_UNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids searching * through all pcbs in the common case of a non-shared * port. It * assumes that an application will never * clear these options after setting them. */ if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. * (No need to send an ICMP Port Unreachable * for a broadcast or multicast datgram.) */ udpstat.udps_noportbcast++; goto badheadlocked; } udp_append(last, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_UNLOCK(last); INP_INFO_RUNLOCK(&udbinfo); return; } /* * Locate pcb for datagram. */ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { udpstat.udps_noportbcast++; goto badheadlocked; } if (blackhole) goto badheadlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badheadlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); INP_INFO_RUNLOCK(&udbinfo); return; } INP_LOCK(inp); /* Check the minimum TTL for socket. */ if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) goto badheadlocked; udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_UNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); return; badheadlocked: if (inp) INP_UNLOCK(inp); INP_INFO_RUNLOCK(&udbinfo); badunlocked: m_freem(m); if (opts) m_freem(opts); return; } /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(last, ip, n, off, udp_in) struct inpcb *last; struct ip *ip; struct mbuf *n; int off; struct sockaddr_in *udp_in; { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif INP_LOCK_ASSERT(last); #if defined(IPSEC) || defined(FAST_IPSEC) /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { #ifdef IPSEC ipsecstat.in_polvio++; #endif /*IPSEC*/ m_freem(n); return; } #endif /*IPSEC || FAST_IPSEC*/ #ifdef MAC if (mac_check_inpcb_deliver(last, n) != 0) { m_freem(n); return; } #endif if (last->inp_flags & INP_CONTROLOPTS || last->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; ip6_savecontrol(last, n, &opts); last->inp_flags = savedflags; } else #endif ip_savecontrol(last, &opts, ip, n); } #ifdef INET6 if (last->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = last->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { m_freem(n); if (opts) m_freem(opts); udpstat.udps_fullsock++; SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } /* * Notify a udp user of an asynchronous error; * just wake up so that he can collect error status. */ struct inpcb * udp_notify(inp, errno) register struct inpcb *inp; int errno; { inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return inp; } void udp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct udphdr *uh; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); if (inp->inp_socket != NULL) { (*notify)(inp, inetctlerrmap[cmd]); } INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); } else in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&udbinfo); gencnt = udbinfo.ipi_gencnt; n = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; INP_INFO_RLOCK(&udbinfo); for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; error = SYSCTL_OUT(req, &xi, sizeof xi); } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&udbinfo); xig.xig_gen = udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = udbinfo.ipi_count; INP_INFO_RUNLOCK(&udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = suser_cred(req->td->td_ucred, SUSER_ALLOWJAIL); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); INP_INFO_RLOCK(&udbinfo); inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; goto out; } error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error) goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: INP_INFO_RUNLOCK(&udbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); static int udp_output(inp, m, addr, control, td) register struct inpcb *inp; struct mbuf *m; struct sockaddr *addr; struct mbuf *control; struct thread *td; { register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct sockaddr_in *sin, src; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front what inpcb locks we will * need. Do any work to decide what is needed up front before * acquiring locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return EMSGSIZE; } src.sin_addr.s_addr = INADDR_ANY; if (control != NULL) { /* * XXX: Currently, we assume all the optional information * is stored in a single mbuf. */ if (control->m_next) { m_freem(control); m_freem(m); return EINVAL; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { m_freem(m); return error; } if (src.sin_addr.s_addr != INADDR_ANY || addr != NULL) { INP_INFO_WLOCK(&udbinfo); unlock_udbinfo = 1; } else unlock_udbinfo = 0; INP_LOCK(inp); #ifdef MAC mac_create_mbuf_from_inpcb(inp, m); #endif laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_addr.s_addr != INADDR_ANY) { if (lport == 0) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } if (addr) { sin = (struct sockaddr_in *)addr; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { /* * Remember addr if jailed, to prevent rebinding. */ if (jailed(td->td_ucred)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header * and addresses and length put into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = IPPROTO_UDP; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + /* + * Set the Don't Fragment bit in the IP header. + */ + if (inp->inp_flags & INP_DONTFRAG) { + struct ip *ip; + ip = (struct ip *)&ui->ui_i; + ip->ip_off |= IP_DF; + } + ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; /* * Set up checksum and output datagram. */ if (udpcksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } else { ui->ui_sum = 0; } ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; if (unlock_udbinfo) INP_INFO_WUNLOCK(&udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); INP_UNLOCK(inp); return (error); release: INP_UNLOCK(inp); if (unlock_udbinfo) INP_INFO_WUNLOCK(&udbinfo); m_freem(m); return (error); } u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); static int udp_abort(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; /* ??? possible? panic instead? */ } INP_LOCK(inp); soisdisconnected(so); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); return 0; } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp != 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } error = soreserve(so, udp_sendspace, udp_recvspace); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } error = in_pcballoc(so, &udbinfo, "udpinp"); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } inp = (struct inpcb *)so->so_pcb; INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; INP_UNLOCK(inp); return 0; } static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; struct sockaddr_in *sin; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return EISCONN; } sin = (struct sockaddr_in *)nam; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); error = in_pcbconnect(inp, nam, td->td_ucred); if (error == 0) soisconnected(so); INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); return error; } static int udp_detach(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); in_pcbdetach(inp); INP_INFO_WUNLOCK(&udbinfo); return 0; } static int udp_disconnect(struct socket *so) { struct inpcb *inp; INP_INFO_WLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_WUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_INFO_WUNLOCK(&udbinfo); INP_UNLOCK(inp); return ENOTCONN; } in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_UNLOCK(inp); INP_INFO_WUNLOCK(&udbinfo); so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); return udp_output(inp, m, addr, control, td); } int udp_shutdown(struct socket *so) { struct inpcb *inp; INP_INFO_RLOCK(&udbinfo); inp = sotoinpcb(so); if (inp == 0) { INP_INFO_RUNLOCK(&udbinfo); return EINVAL; } INP_LOCK(inp); INP_INFO_RUNLOCK(&udbinfo); socantsendmore(so); INP_UNLOCK(inp); return 0; } /* * This is the wrapper function for in_setsockaddr. We just pass down * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking * here because in_setsockaddr will call malloc and might block. */ static int udp_sockaddr(struct socket *so, struct sockaddr **nam) { return (in_setsockaddr(so, nam, &udbinfo)); } /* * This is the wrapper function for in_setpeeraddr. We just pass down * the pcbinfo for in_setpeeraddr to lock. */ static int udp_peeraddr(struct socket *so, struct sockaddr **nam) { return (in_setpeeraddr(so, nam, &udbinfo)); } struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = udp_peeraddr, .pru_send = udp_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = udp_sockaddr, .pru_sosetlabel = in_pcbsosetlabel };