Changeset View
Standalone View
sys/netinet/ip_input.c
Show All 31 Lines | |||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include "opt_bootp.h" | #include "opt_bootp.h" | ||||
#include "opt_ipfw.h" | #include "opt_ipfw.h" | ||||
#include "opt_ipstealth.h" | #include "opt_ipstealth.h" | ||||
#include "opt_ipsec.h" | #include "opt_ipsec.h" | ||||
#include "opt_route.h" | #include "opt_route.h" | ||||
#include "opt_rss.h" | |||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/domain.h> | #include <sys/domain.h> | ||||
#include <sys/protosw.h> | #include <sys/protosw.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
Show All 24 Lines | |||||
#include <netinet/ip_fw.h> | #include <netinet/ip_fw.h> | ||||
#include <netinet/ip_icmp.h> | #include <netinet/ip_icmp.h> | ||||
#include <netinet/ip_options.h> | #include <netinet/ip_options.h> | ||||
#include <machine/in_cksum.h> | #include <machine/in_cksum.h> | ||||
#include <netinet/ip_carp.h> | #include <netinet/ip_carp.h> | ||||
#ifdef IPSEC | #ifdef IPSEC | ||||
#include <netinet/ip_ipsec.h> | #include <netinet/ip_ipsec.h> | ||||
#endif /* IPSEC */ | #endif /* IPSEC */ | ||||
#include <netinet/in_rss.h> | |||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#ifdef CTASSERT | #ifdef CTASSERT | ||||
CTASSERT(sizeof(struct ip) == 20); | CTASSERT(sizeof(struct ip) == 20); | ||||
#endif | #endif | ||||
Show All 33 Lines | |||||
/* | /* | ||||
* XXX - Setting ip_checkinterface mostly implements the receive side of | * XXX - Setting ip_checkinterface mostly implements the receive side of | ||||
* the Strong ES model described in RFC 1122, but since the routing table | * the Strong ES model described in RFC 1122, but since the routing table | ||||
* and transmit implementation do not implement the Strong ES model, | * and transmit implementation do not implement the Strong ES model, | ||||
* setting this to 1 results in an odd hybrid. | * setting this to 1 results in an odd hybrid. | ||||
* | * | ||||
* XXX - ip_checkinterface currently must be disabled if you use ipnat | * XXX - ip_checkinterface currently must be disabled if you use ipnat | ||||
* to translate the destination address to another local interface. | * to translate the destination address to another local interface. | ||||
grehan: If RSS is active, there is no option to direct-dispatch RSS-capable traffic or you risk sending a flow to the wrong PCB group. | |||||
* | * | ||||
* XXX - ip_checkinterface must be disabled if you add IP aliases | * XXX - ip_checkinterface must be disabled if you add IP aliases | ||||
* to the loopback interface instead of the interface where the | * to the loopback interface instead of the interface where the | ||||
* packets for those addresses are received. | * packets for those addresses are received. | ||||
*/ | */ | ||||
static VNET_DEFINE(int, ip_checkinterface); | static VNET_DEFINE(int, ip_checkinterface); | ||||
#define V_ip_checkinterface VNET(ip_checkinterface) | #define V_ip_checkinterface VNET(ip_checkinterface) | ||||
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, | SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, | ||||
&VNET_NAME(ip_checkinterface), 0, | &VNET_NAME(ip_checkinterface), 0, | ||||
"Verify packet arrives on correct interface"); | "Verify packet arrives on correct interface"); | ||||
VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ | VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ | ||||
/* | |||||
* We may need to re-inject packets into the IP stack for further work. | |||||
* In this instance, use the CPU policy and query the RSS layer for the | |||||
Not Done Inline ActionsFragments should not be resubmitted to the stack: they have already been accounted for in terms of statistics. It would seem you need an additional netisr that is going to submit the already verified/accounted frame directly to the inetsw (even the M_FASTFWD_OURS bypass will still double-count ips_delivered) grehan: Fragments should not be resubmitted to the stack: they have already been accounted for in terms… | |||||
Not Done Inline ActionsOn further thought, I think the FASTFWD is what you want, modulo the byte order/length expectations in the first conditional in ip_input. ips_delivered won't be double counted since the frame is netisr'd prior to reaching inetsw. grehan: On further thought, I think the FASTFWD is what you want, modulo the byte order/length… | |||||
Not Done Inline ActionsI think that yes, perhaps we do need an ip reinject netisr that things get queued to. I'm just worried about unintended queue behaviour between the IP and IP-reinject queues. Thanks for picking that up though! I didn't even think about it double-accounting things. So hm, does that mean that the various IP tunnel de-encapsulation paths are falling similarly afoul? They also re-queue into the NETISR_IP queue. adrian: I think that yes, perhaps we do need an ip reinject netisr that things get queued to. I'm just… | |||||
Not Done Inline ActionsTunnel decaps is entirely different: you're dealing with a completely new packet once the outer layer has been stripped: it hasn't had any input processing. The requeueing is problematic. There has already been work done to process the packet, and you don't want to drop it because a netisr queue is full. That smells a lot like a DoS attack. A possible way to do this is to reserve a netisr queue slot for a reassembly entry, and only initiate a new reassembly if there is an available slot, so it is guaranteed that a successful reassembly will result in delivery to the transport layer. Seems like it's easy to spin out of control with complexity in this area :( grehan: Tunnel decaps is entirely different: you're dealing with a completely new packet once the outer… | |||||
* relevant CPU ID to use. | |||||
*/ | |||||
static struct netisr_handler ip_nh = { | static struct netisr_handler ip_nh = { | ||||
.nh_name = "ip", | .nh_name = "ip", | ||||
.nh_handler = ip_input, | .nh_handler = ip_input, | ||||
.nh_proto = NETISR_IP, | .nh_proto = NETISR_IP, | ||||
#ifdef RSS | |||||
.nh_m2cpuid = rss_m2cpuid, | |||||
.nh_policy = NETISR_POLICY_CPU, | |||||
#else | |||||
.nh_policy = NETISR_POLICY_FLOW, | .nh_policy = NETISR_POLICY_FLOW, | ||||
#endif | |||||
}; | }; | ||||
extern struct domain inetdomain; | extern struct domain inetdomain; | ||||
extern struct protosw inetsw[]; | extern struct protosw inetsw[]; | ||||
u_char ip_protox[IPPROTO_MAX]; | u_char ip_protox[IPPROTO_MAX]; | ||||
VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ | VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ | ||||
VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ | VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ | ||||
VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ | VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ | ||||
▲ Show 20 Lines • Show All 204 Lines • ▼ Show 20 Lines | ip_input(struct mbuf *m) | ||||
struct ip *ip = NULL; | struct ip *ip = NULL; | ||||
struct in_ifaddr *ia = NULL; | struct in_ifaddr *ia = NULL; | ||||
struct ifaddr *ifa; | struct ifaddr *ifa; | ||||
struct ifnet *ifp; | struct ifnet *ifp; | ||||
int checkif, hlen = 0; | int checkif, hlen = 0; | ||||
uint16_t sum, ip_len; | uint16_t sum, ip_len; | ||||
int dchg = 0; /* dest changed after fw */ | int dchg = 0; /* dest changed after fw */ | ||||
struct in_addr odst; /* original dst address */ | struct in_addr odst; /* original dst address */ | ||||
int reinjected = 0; | |||||
M_ASSERTPKTHDR(m); | M_ASSERTPKTHDR(m); | ||||
if (m->m_flags & M_REINJECT_OURS) { | |||||
grehanUnsubmitted Not Done Inline ActionsYou're now making changes that are outside of the RSS conditional, so it seems like this needs additional review. My suggestion is to collapse the 2 tests of *_OURS into 1 to avoid the extra conditional test in a common path if (m->m_flags & (M_FASTFWD_OURS|M_REINJECT_OURS)) { /* put individual tests here */ } Also, the use of the reinjected flag doesn't seem to be necessary. There is no need to do *any* tests again: the fragment code has always passed reassembled fragments directly to the stack, and all the intercept points knew this. Seems less invasive to have a label just prior to the inetsw dispatch and goto that, rather than a seemingly arbitrary set of tests bracketed with the reinjected flag, and some without. The real fix is to have a netisr (or other) dispatch that goes directly to the protocols. I'm not sure what the "unintended queue behaviour" is that you are referring to. Certainly the proprietary implementation I worked on did have a dispatch direct to protocols without any side effects. grehan: You're now making changes that are outside of the RSS conditional, so it seems like this needs… | |||||
m->m_flags &= ~M_REINJECT_OURS; | |||||
reinjected = 1; | |||||
} | |||||
if (m->m_flags & M_FASTFWD_OURS) { | if (m->m_flags & M_FASTFWD_OURS) { | ||||
m->m_flags &= ~M_FASTFWD_OURS; | m->m_flags &= ~M_FASTFWD_OURS; | ||||
/* Set up some basics that will be used later. */ | /* Set up some basics that will be used later. */ | ||||
ip = mtod(m, struct ip *); | ip = mtod(m, struct ip *); | ||||
hlen = ip->ip_hl << 2; | hlen = ip->ip_hl << 2; | ||||
ip_len = ntohs(ip->ip_len); | ip_len = ntohs(ip->ip_len); | ||||
goto ours; | goto ours; | ||||
} | } | ||||
if (! reinjected) | |||||
IPSTAT_INC(ips_total); | IPSTAT_INC(ips_total); | ||||
if (m->m_pkthdr.len < sizeof(struct ip)) | if (m->m_pkthdr.len < sizeof(struct ip)) | ||||
goto tooshort; | goto tooshort; | ||||
if (m->m_len < sizeof (struct ip) && | if (m->m_len < sizeof (struct ip) && | ||||
(m = m_pullup(m, sizeof (struct ip))) == NULL) { | (m = m_pullup(m, sizeof (struct ip))) == NULL) { | ||||
IPSTAT_INC(ips_toosmall); | IPSTAT_INC(ips_toosmall); | ||||
return; | return; | ||||
Show All 13 Lines | ip_input(struct mbuf *m) | ||||
if (hlen > m->m_len) { | if (hlen > m->m_len) { | ||||
if ((m = m_pullup(m, hlen)) == NULL) { | if ((m = m_pullup(m, hlen)) == NULL) { | ||||
IPSTAT_INC(ips_badhlen); | IPSTAT_INC(ips_badhlen); | ||||
return; | return; | ||||
} | } | ||||
ip = mtod(m, struct ip *); | ip = mtod(m, struct ip *); | ||||
} | } | ||||
if (! reinjected) | |||||
IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); | IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); | ||||
/* 127/8 must not appear on wire - RFC1122 */ | /* 127/8 must not appear on wire - RFC1122 */ | ||||
ifp = m->m_pkthdr.rcvif; | ifp = m->m_pkthdr.rcvif; | ||||
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || | if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || | ||||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { | (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { | ||||
if ((ifp->if_flags & IFF_LOOPBACK) == 0) { | if ((ifp->if_flags & IFF_LOOPBACK) == 0) { | ||||
IPSTAT_INC(ips_badaddr); | IPSTAT_INC(ips_badaddr); | ||||
goto bad; | goto bad; | ||||
} | } | ||||
} | } | ||||
/* XXX should we bypass this for reinjected frames? */ | |||||
if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { | if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { | ||||
sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); | sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); | ||||
} else { | } else { | ||||
if (hlen == sizeof(struct ip)) { | if (hlen == sizeof(struct ip)) { | ||||
sum = in_cksum_hdr(ip); | sum = in_cksum_hdr(ip); | ||||
} else { | } else { | ||||
sum = in_cksum(m, hlen); | sum = in_cksum(m, hlen); | ||||
} | } | ||||
} | } | ||||
if (sum) { | if (sum) { | ||||
IPSTAT_INC(ips_badsum); | IPSTAT_INC(ips_badsum); | ||||
goto bad; | goto bad; | ||||
} | } | ||||
#ifdef ALTQ | #ifdef ALTQ | ||||
if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) | |||||
if (altq_input != NULL && | |||||
(! reinjected) && | |||||
(*altq_input)(m, AF_INET) == 0) | |||||
/* packet is dropped by traffic conditioner */ | /* packet is dropped by traffic conditioner */ | ||||
return; | return; | ||||
#endif | #endif | ||||
ip_len = ntohs(ip->ip_len); | ip_len = ntohs(ip->ip_len); | ||||
if (ip_len < hlen) { | if (ip_len < hlen) { | ||||
IPSTAT_INC(ips_badlen); | IPSTAT_INC(ips_badlen); | ||||
goto bad; | goto bad; | ||||
▲ Show 20 Lines • Show All 270 Lines • ▼ Show 20 Lines | #ifdef IPSEC | ||||
* enforce IPsec policy checking if we are seeing last header. | * enforce IPsec policy checking if we are seeing last header. | ||||
* note that we do not visit this with protocols with pcb layer | * note that we do not visit this with protocols with pcb layer | ||||
* code - like udp/tcp/raw ip. | * code - like udp/tcp/raw ip. | ||||
*/ | */ | ||||
if (ip_ipsec_input(m)) | if (ip_ipsec_input(m)) | ||||
goto bad; | goto bad; | ||||
#endif /* IPSEC */ | #endif /* IPSEC */ | ||||
/* | /* | ||||
grehanUnsubmitted Not Done Inline Actionsgoto label for reinjected frames should go here. grehan: goto label for reinjected frames should go here. | |||||
adrianAuthorUnsubmitted Not Done Inline ActionsRight, but we should also at least run it through pfil once more, right? Hm, thinking about it, it _isn't_ doing it for reassembled frames anyway, so. Ok. I think I'll do this for now. adrian: Right, but we should also at least run it through pfil once more, right?
Hm, thinking about it… | |||||
* Switch out to protocol's input routine. | * Switch out to protocol's input routine. | ||||
*/ | */ | ||||
IPSTAT_INC(ips_delivered); | IPSTAT_INC(ips_delivered); | ||||
(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); | (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); | ||||
return; | return; | ||||
bad: | bad: | ||||
m_freem(m); | m_freem(m); | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | |||||
{ | { | ||||
struct ip *ip; | struct ip *ip; | ||||
struct mbuf *p, *q, *nq, *t; | struct mbuf *p, *q, *nq, *t; | ||||
struct ipq *fp = NULL; | struct ipq *fp = NULL; | ||||
struct ipqhead *head; | struct ipqhead *head; | ||||
int i, hlen, next; | int i, hlen, next; | ||||
u_int8_t ecn, ecn0; | u_int8_t ecn, ecn0; | ||||
u_short hash; | u_short hash; | ||||
#ifdef RSS | |||||
uint32_t rss_hash, rss_type; | |||||
#endif | |||||
/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ | /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ | ||||
if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { | if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { | ||||
IPSTAT_INC(ips_fragments); | IPSTAT_INC(ips_fragments); | ||||
IPSTAT_INC(ips_fragdropped); | IPSTAT_INC(ips_fragdropped); | ||||
m_freem(m); | m_freem(m); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
▲ Show 20 Lines • Show All 273 Lines • ▼ Show 20 Lines | #endif | ||||
uma_zfree(V_ipq_zone, fp); | uma_zfree(V_ipq_zone, fp); | ||||
m->m_len += (ip->ip_hl << 2); | m->m_len += (ip->ip_hl << 2); | ||||
m->m_data -= (ip->ip_hl << 2); | m->m_data -= (ip->ip_hl << 2); | ||||
/* some debugging cruft by sklower, below, will go away soon */ | /* some debugging cruft by sklower, below, will go away soon */ | ||||
if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ | if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ | ||||
m_fixhdr(m); | m_fixhdr(m); | ||||
IPSTAT_INC(ips_reassembled); | IPSTAT_INC(ips_reassembled); | ||||
IPQ_UNLOCK(); | IPQ_UNLOCK(); | ||||
#ifdef RSS | |||||
/* | |||||
* Query the RSS layer for the flowid / flowtype for the | |||||
* mbuf payload. | |||||
* | |||||
* For now, just assume we have to calculate a new one. | |||||
* Later on we should check to see if the assigned flowid matches | |||||
* what RSS wants for the given IP protocol and if so, just keep it. | |||||
* | |||||
* We then queue into the relevant netisr so it can be dispatched | |||||
* to the correct CPU. | |||||
* | |||||
* Note - this may return 1, which means the flowid in the mbuf | |||||
* is correct for the configured RSS hash types and can be used. | |||||
*/ | |||||
if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { | |||||
m->m_pkthdr.flowid = rss_hash; | |||||
M_HASHTYPE_SET(m, rss_type); | |||||
m->m_flags |= M_FLOWID; | |||||
} | |||||
#endif | |||||
#ifdef RSS | |||||
/* | |||||
* Queue/dispatch for reprocessing. | |||||
* | |||||
* Note: this is much slower than just handling the frame in the | |||||
* current receive context. It's likely worth investigating | |||||
* why this is. | |||||
*/ | |||||
m->m_flags |= M_REINJECT_OURS; | |||||
netisr_dispatch(NETISR_IP, m); | |||||
return (NULL); | |||||
#endif | |||||
/* Handle in-line */ | |||||
return (m); | return (m); | ||||
Not Done Inline ActionsThis should either be a panic if RSS is defined and the packet can be hashed, or perhaps verify if the current PCB context is the one the packet belongs to. grehan: This should either be a panic if RSS is defined and the packet can be hashed, or perhaps verify… | |||||
Not Done Inline ActionsThis was mostly for debugging/evaluation purposes for the (current) UDP situation where it's 2-tuple hashed. You're right though - if RSS is enabled then it should just netisr_dispatch() and not be configurable. adrian: This was mostly for debugging/evaluation purposes for the (current) UDP situation where it's 2… | |||||
dropfrag: | dropfrag: | ||||
IPSTAT_INC(ips_fragdropped); | IPSTAT_INC(ips_fragdropped); | ||||
if (fp != NULL) | if (fp != NULL) | ||||
fp->ipq_nfrags--; | fp->ipq_nfrags--; | ||||
m_freem(m); | m_freem(m); | ||||
done: | done: | ||||
IPQ_UNLOCK(); | IPQ_UNLOCK(); | ||||
▲ Show 20 Lines • Show All 633 Lines • Show Last 20 Lines |
If RSS is active, there is no option to direct-dispatch RSS-capable traffic or you risk sending a flow to the wrong PCB group.