Changeset View
Standalone View
sys/netinet/ip_input.c
Show All 31 Lines | |||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include "opt_bootp.h" | #include "opt_bootp.h" | ||||
#include "opt_ipfw.h" | #include "opt_ipfw.h" | ||||
#include "opt_ipstealth.h" | #include "opt_ipstealth.h" | ||||
#include "opt_ipsec.h" | #include "opt_ipsec.h" | ||||
#include "opt_route.h" | #include "opt_route.h" | ||||
#include "opt_rss.h" | |||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/domain.h> | #include <sys/domain.h> | ||||
#include <sys/protosw.h> | #include <sys/protosw.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
Show All 24 Lines | |||||
#include <netinet/ip_fw.h> | #include <netinet/ip_fw.h> | ||||
#include <netinet/ip_icmp.h> | #include <netinet/ip_icmp.h> | ||||
#include <netinet/ip_options.h> | #include <netinet/ip_options.h> | ||||
#include <machine/in_cksum.h> | #include <machine/in_cksum.h> | ||||
#include <netinet/ip_carp.h> | #include <netinet/ip_carp.h> | ||||
#ifdef IPSEC | #ifdef IPSEC | ||||
#include <netinet/ip_ipsec.h> | #include <netinet/ip_ipsec.h> | ||||
#endif /* IPSEC */ | #endif /* IPSEC */ | ||||
#include <netinet/in_rss.h> | |||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#ifdef CTASSERT | #ifdef CTASSERT | ||||
CTASSERT(sizeof(struct ip) == 20); | CTASSERT(sizeof(struct ip) == 20); | ||||
#endif | #endif | ||||
Show All 26 Lines | SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, | ||||
&VNET_NAME(ip_sendsourcequench), 0, | &VNET_NAME(ip_sendsourcequench), 0, | ||||
"Enable the transmission of source quench packets"); | "Enable the transmission of source quench packets"); | ||||
VNET_DEFINE(int, ip_do_randomid); | VNET_DEFINE(int, ip_do_randomid); | ||||
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, | SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, | ||||
&VNET_NAME(ip_do_randomid), 0, | &VNET_NAME(ip_do_randomid), 0, | ||||
"Assign random ip_id values"); | "Assign random ip_id values"); | ||||
#ifdef RSS | |||||
static int ip_reass_netisr_dispatch = 1; | |||||
#else | |||||
static int ip_reass_netisr_dispatch = 0; | |||||
#endif | |||||
SYSCTL_INT(_net_inet_ip, OID_AUTO, reass_netisr_dispatch, CTLFLAG_RW, | |||||
&ip_reass_netisr_dispatch, 0, | |||||
"IP fragment reassembly - direct=0, netisr dispatch=1, queue=2"); | |||||
grehan: If RSS is active, there is no option to direct-dispatch RSS-capable traffic or you risk sending a flow to the wrong PCB group. | |||||
/* | /* | ||||
* XXX - Setting ip_checkinterface mostly implements the receive side of | * XXX - Setting ip_checkinterface mostly implements the receive side of | ||||
* the Strong ES model described in RFC 1122, but since the routing table | * the Strong ES model described in RFC 1122, but since the routing table | ||||
* and transmit implementation do not implement the Strong ES model, | * and transmit implementation do not implement the Strong ES model, | ||||
* setting this to 1 results in an odd hybrid. | * setting this to 1 results in an odd hybrid. | ||||
* | * | ||||
* XXX - ip_checkinterface currently must be disabled if you use ipnat | * XXX - ip_checkinterface currently must be disabled if you use ipnat | ||||
* to translate the destination address to another local interface. | * to translate the destination address to another local interface. | ||||
* | * | ||||
* XXX - ip_checkinterface must be disabled if you add IP aliases | * XXX - ip_checkinterface must be disabled if you add IP aliases | ||||
* to the loopback interface instead of the interface where the | * to the loopback interface instead of the interface where the | ||||
* packets for those addresses are received. | * packets for those addresses are received. | ||||
*/ | */ | ||||
/*
 * When non-zero, drop packets whose destination address is not
 * configured on the interface they arrived on (receive side of the
 * RFC 1122 Strong ES model; see the XXX caveats above).
 */
static VNET_DEFINE(int, ip_checkinterface);
#define	V_ip_checkinterface	VNET(ip_checkinterface)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
    &VNET_NAME(ip_checkinterface), 0,
    "Verify packet arrives on correct interface");
VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ | VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ | ||||
/* | |||||
* We may need to re-inject packets into the IP stack for further work. | |||||
* In this instance, use the CPU policy and query the RSS layer for the | |||||
grehan (Unsubmitted, Not Done, Inline Action): Fragments should not be resubmitted to the stack: they have already been accounted for in terms of statistics. It would seem you need an additional netisr that is going to submit the already verified/accounted frame directly to the inetsw (even the M_FASTFWD_OURS bypass will still double-count ips_delivered). grehan: Fragment should not be resubmitted to the stack: they have already been accounted for in terms… | |||||
grehanUnsubmitted Not Done Inline ActionsOn further thought, I think the FASTFWD is what you want, modulo the byte order/length expectations in the first conditional in ip_input. ips_delivered won't be double counted since the frame is netisr'd prior to reaching inetsw. grehan: On further thought, I think the FASTFWD is what you want, modulo the byte order/length… | |||||
adrianAuthorUnsubmitted Not Done Inline ActionsI think that yes, perhaps we do need an ip reinject netisr that things get queued to. I'm just worried about unintended queue behaviour between the IP and IP-reinject queues. Thanks for picking that up though! I didn't even think about it double-accounting things. So hm, does that mean that the various IP tunnel de-encapsulation paths are falling similarly afoul? They also re-queue into the NETISR_IP queue. adrian: I think that yes, perhaps we do need an ip reinject netisr that things get queued to. I'm just… | |||||
grehanUnsubmitted Not Done Inline ActionsTunnel decaps is entirely different: you're dealing with a completely new packet once the outer layer has been stripped: it hasn't had any input processing. The requeueing is problematic. There has already been work done to process the packet, and you don't want to drop it because a netisr queue is full. That smells a lot like a DoS attack. A possible way to do this is to reserve a netisr queue slot for a reassembly entry, and only initiate a new reassembly if there is an available slot, so it is guaranteed that a successful reassembly will result in delivery to the transport layer. Seems like it's easy to spin out of control with complexity in this area :( grehan: Tunnel decaps is entirely different: you're dealing with a completely new packet once the outer… | |||||
* relevant CPU ID to use. | |||||
*/ | |||||
/*
 * netisr registration for IPv4 input.  With RSS compiled in, the
 * netisr layer queries the RSS code (rss_m2cpuid) for the CPU that
 * owns each mbuf's flow, so a flow is always processed on the CPU
 * owning its PCB group; without RSS, work is distributed by flowid.
 */
static struct netisr_handler ip_nh = {
	.nh_name = "ip",
	.nh_handler = ip_input,
	.nh_proto = NETISR_IP,
#ifdef RSS
	/* Ask RSS for the target CPU of each mbuf. */
	.nh_m2cpuid = rss_m2cpuid,
	.nh_policy = NETISR_POLICY_CPU,
#else
	.nh_policy = NETISR_POLICY_FLOW,
#endif
};
extern struct domain inetdomain; | extern struct domain inetdomain; | ||||
extern struct protosw inetsw[]; | extern struct protosw inetsw[]; | ||||
u_char ip_protox[IPPROTO_MAX]; | u_char ip_protox[IPPROTO_MAX]; | ||||
VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ | VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ | ||||
VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ | VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ | ||||
VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ | VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ | ||||
▲ Show 20 Lines • Show All 207 Lines • ▼ Show 20 Lines | ip_input(struct mbuf *m) | ||||
struct ifnet *ifp; | struct ifnet *ifp; | ||||
int checkif, hlen = 0; | int checkif, hlen = 0; | ||||
uint16_t sum, ip_len; | uint16_t sum, ip_len; | ||||
int dchg = 0; /* dest changed after fw */ | int dchg = 0; /* dest changed after fw */ | ||||
struct in_addr odst; /* original dst address */ | struct in_addr odst; /* original dst address */ | ||||
M_ASSERTPKTHDR(m); | M_ASSERTPKTHDR(m); | ||||
if (m->m_flags & M_FASTFWD_OURS) { | if (m->m_flags & M_FASTFWD_OURS) { | ||||
Not Done Inline ActionsYou're now making changes that are outside of the RSS conditional, so it seems like this needs additional review. My suggestion is that collapse the 2 tests of *_OURS into 1 to avoid the extra conditional test in a common path if (m->m_flags & M_FASTFWD_OURS|M_REINJECT_OURS) { /* put individual tests here */ } Also, the use of the reinjected flag doesn't seem to be necessary. There is no need to do *any* tests again: the fragment code has always passed reassembled fragments directly to the stack, and all the intercept points knew this. Seems less invasive to have a label just prior to the inetsw dispatch and goto that, rather than a seemingly arbitrary set of tests bracketed with the reinjected flag, and some without. The real fix is to have a netisr (or other) dispatch that goes directly to the protocols. I'm not sure what the "unintended queue behaviour" is that you are referring to. Certainly the proprietary implementation I worked on did have a dispatch direct to protocols without any side effects. grehan: You're now making changes that are outside of the RSS conditional, so it seems like this needs… | |||||
m->m_flags &= ~M_FASTFWD_OURS; | m->m_flags &= ~M_FASTFWD_OURS; | ||||
/* Set up some basics that will be used later. */ | /* Set up some basics that will be used later. */ | ||||
ip = mtod(m, struct ip *); | ip = mtod(m, struct ip *); | ||||
hlen = ip->ip_hl << 2; | hlen = ip->ip_hl << 2; | ||||
ip_len = ntohs(ip->ip_len); | ip_len = ntohs(ip->ip_len); | ||||
goto ours; | goto ours; | ||||
} | } | ||||
▲ Show 20 Lines • Show All 336 Lines • ▼ Show 20 Lines | #ifdef IPSEC | ||||
* enforce IPsec policy checking if we are seeing last header. | * enforce IPsec policy checking if we are seeing last header. | ||||
* note that we do not visit this with protocols with pcb layer | * note that we do not visit this with protocols with pcb layer | ||||
* code - like udp/tcp/raw ip. | * code - like udp/tcp/raw ip. | ||||
*/ | */ | ||||
if (ip_ipsec_input(m)) | if (ip_ipsec_input(m)) | ||||
goto bad; | goto bad; | ||||
#endif /* IPSEC */ | #endif /* IPSEC */ | ||||
/* | /* | ||||
Not Done Inline Actionsgoto label for reinjected frames should go here. grehan: goto label for reinjected frames should go here. | |||||
Not Done Inline ActionsRight, but we should also at least run it through pfil once more, right? Hm, thinking about it it _isn't_ doing it for reassembled frames anyway, so. Ok. I think I'll do this for now. adrian: Right, but we should also at least run it through pfil once more, right?
Hm, thinking about it… | |||||
* Switch out to protocol's input routine. | * Switch out to protocol's input routine. | ||||
*/ | */ | ||||
IPSTAT_INC(ips_delivered); | IPSTAT_INC(ips_delivered); | ||||
(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); | (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); | ||||
return; | return; | ||||
bad: | bad: | ||||
m_freem(m); | m_freem(m); | ||||
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines | |||||
{ | { | ||||
struct ip *ip; | struct ip *ip; | ||||
struct mbuf *p, *q, *nq, *t; | struct mbuf *p, *q, *nq, *t; | ||||
struct ipq *fp = NULL; | struct ipq *fp = NULL; | ||||
struct ipqhead *head; | struct ipqhead *head; | ||||
int i, hlen, next; | int i, hlen, next; | ||||
u_int8_t ecn, ecn0; | u_int8_t ecn, ecn0; | ||||
u_short hash; | u_short hash; | ||||
#ifdef RSS | |||||
uint32_t rss_hash, rss_type; | |||||
#endif | |||||
/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ | /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ | ||||
if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { | if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { | ||||
IPSTAT_INC(ips_fragments); | IPSTAT_INC(ips_fragments); | ||||
IPSTAT_INC(ips_fragdropped); | IPSTAT_INC(ips_fragdropped); | ||||
m_freem(m); | m_freem(m); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
▲ Show 20 Lines • Show All 273 Lines • ▼ Show 20 Lines | #endif | ||||
uma_zfree(V_ipq_zone, fp); | uma_zfree(V_ipq_zone, fp); | ||||
m->m_len += (ip->ip_hl << 2); | m->m_len += (ip->ip_hl << 2); | ||||
m->m_data -= (ip->ip_hl << 2); | m->m_data -= (ip->ip_hl << 2); | ||||
/* some debugging cruft by sklower, below, will go away soon */ | /* some debugging cruft by sklower, below, will go away soon */ | ||||
if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ | if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ | ||||
m_fixhdr(m); | m_fixhdr(m); | ||||
IPSTAT_INC(ips_reassembled); | IPSTAT_INC(ips_reassembled); | ||||
IPQ_UNLOCK(); | IPQ_UNLOCK(); | ||||
#ifdef RSS | |||||
/* | |||||
* Query the RSS layer for the flowid / flowtype for the | |||||
* mbuf payload. | |||||
* | |||||
* We then queue into the relevant netisr so it can be dispatched | |||||
* to the correct CPU. | |||||
* | |||||
* Note - this may return 1, which means the flowid in the mbuf | |||||
* is correct for the configured RSS hash types and can be used. | |||||
*/ | |||||
if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { | |||||
m->m_pkthdr.flowid = rss_hash; | |||||
M_HASHTYPE_SET(m, rss_type); | |||||
m->m_flags |= M_FLOWID; | |||||
} | |||||
#endif | |||||
/* | |||||
* Queue/dispatch for reprocessing. | |||||
* | |||||
* When doing queue to the same CPU, the netisr and the NIC queue | |||||
* end up taking most of the CPU, starving it of time for userland. | |||||
* This means that most of the packets do get dropped. | |||||
* | |||||
* If another CPU handles the netisr side - and/or another CPU handles | |||||
* the userland side - then performance is much, much better. | |||||
* | |||||
* So use dispatch for now. | |||||
* | |||||
* Note: Doing a dispatch with the nh_m2cpuid method and netisr | |||||
* versus handling the fragment via the normal path gives some | |||||
* pretty spectacularly crappy performance in comparison. | |||||
* That needs to be addressed. | |||||
*/ | |||||
if (ip_reass_netisr_dispatch == 1) { | |||||
netisr_dispatch(NETISR_IP, m); | |||||
return (NULL); | |||||
} else if (ip_reass_netisr_dispatch == 2) { | |||||
netisr_queue(NETISR_IP, m); | |||||
return (NULL); | |||||
} | |||||
/* No netisr dispatch; handle inline */ | |||||
return (m); | return (m); | ||||
grehanUnsubmitted Not Done Inline ActionsThis should either be a panic if RSS is defined and the packet can be hashed, or perhaps verify if the current PCB context is the one the packet belongs to. grehan: This should either be a panic if RSS is defined and the packet can be hashed, or perhaps verify… | |||||
adrianAuthorUnsubmitted Not Done Inline ActionsThis was mostly for debugging/evaluation purposes for the (current) UDP situation where it's 2-tuple hashed. You're right though - if RSS is enabled then it should just netisr_dispatch() and not be configurable. adrian: This was mostly for debugging/evaluation purposes for the (current) UDP situation where it's 2… | |||||
dropfrag: | dropfrag: | ||||
IPSTAT_INC(ips_fragdropped); | IPSTAT_INC(ips_fragdropped); | ||||
if (fp != NULL) | if (fp != NULL) | ||||
fp->ipq_nfrags--; | fp->ipq_nfrags--; | ||||
m_freem(m); | m_freem(m); | ||||
done: | done: | ||||
IPQ_UNLOCK(); | IPQ_UNLOCK(); | ||||
▲ Show 20 Lines • Show All 626 Lines • Show Last 20 Lines |
If RSS is active, there is no option to direct-dispatch RSS-capable traffic or you risk sending a flow to the wrong PCB group.