Changeset View
Standalone View
sys/netinet/in_pcb.c
Show All 36 Lines | |||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include "opt_ddb.h" | #include "opt_ddb.h" | ||||
#include "opt_ipsec.h" | #include "opt_ipsec.h" | ||||
#include "opt_inet.h" | #include "opt_inet.h" | ||||
#include "opt_inet6.h" | #include "opt_inet6.h" | ||||
#include "opt_ratelimit.h" | |||||
#include "opt_pcbgroup.h" | #include "opt_pcbgroup.h" | ||||
#include "opt_rss.h" | #include "opt_rss.h" | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
#include <sys/callout.h> | #include <sys/callout.h> | ||||
#include <sys/eventhandler.h> | #include <sys/eventhandler.h> | ||||
#include <sys/domain.h> | #include <sys/domain.h> | ||||
#include <sys/protosw.h> | #include <sys/protosw.h> | ||||
#include <sys/rmlock.h> | #include <sys/rmlock.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <sys/sockio.h> | |||||
#include <sys/priv.h> | #include <sys/priv.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/refcount.h> | #include <sys/refcount.h> | ||||
#include <sys/jail.h> | #include <sys/jail.h> | ||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#ifdef DDB | #ifdef DDB | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | |||||
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | ||||
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | ||||
VNET_DEFINE(int, ipport_tcpallocs); | VNET_DEFINE(int, ipport_tcpallocs); | ||||
static VNET_DEFINE(int, ipport_tcplastcount); | static VNET_DEFINE(int, ipport_tcplastcount); | ||||
#define V_ipport_tcplastcount VNET(ipport_tcplastcount) | #define V_ipport_tcplastcount VNET(ipport_tcplastcount) | ||||
static void in_pcbremlists(struct inpcb *inp); | static void in_pcbremlists(struct inpcb *inp); | ||||
#ifdef RATELIMIT | |||||
static void in_pcbdetach_txrtlmt(struct inpcb *inp); | |||||
#endif | |||||
#ifdef INET | #ifdef INET | ||||
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | ||||
struct in_addr faddr, u_int fport_arg, | struct in_addr faddr, u_int fport_arg, | ||||
struct in_addr laddr, u_int lport_arg, | struct in_addr laddr, u_int lport_arg, | ||||
int lookupflags, struct ifnet *ifp); | int lookupflags, struct ifnet *ifp); | ||||
#define RANGECHK(var, min, max) \ | #define RANGECHK(var, min, max) \ | ||||
if ((var) < (min)) { (var) = (min); } \ | if ((var) < (min)) { (var) = (min); } \ | ||||
▲ Show 20 Lines • Show All 988 Lines • ▼ Show 20 Lines | |||||
* socket, in which case in_pcbfree() is deferred. | * socket, in which case in_pcbfree() is deferred. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbdetach(struct inpcb *inp) | in_pcbdetach(struct inpcb *inp) | ||||
{ | { | ||||
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); | KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); | ||||
#ifdef RATELIMIT | |||||
if (inp->inp_txring_ifp != NULL) | |||||
in_pcbdetach_txrtlmt(inp); | |||||
#endif | |||||
inp->inp_socket->so_pcb = NULL; | inp->inp_socket->so_pcb = NULL; | ||||
inp->inp_socket = NULL; | inp->inp_socket = NULL; | ||||
} | } | ||||
/* | /* | ||||
* in_pcbref() bumps the reference count on an inpcb in order to maintain | * in_pcbref() bumps the reference count on an inpcb in order to maintain | ||||
* stability of an inpcb pointer despite the inpcb lock being released. This | * stability of an inpcb pointer despite the inpcb lock being released. This | ||||
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | ||||
▲ Show 20 Lines • Show All 1,527 Lines • ▼ Show 20 Lines | if (!have_addr) { | ||||
db_printf("usage: show inpcb <addr>\n"); | db_printf("usage: show inpcb <addr>\n"); | ||||
return; | return; | ||||
} | } | ||||
inp = (struct inpcb *)addr; | inp = (struct inpcb *)addr; | ||||
db_print_inpcb(inp, "inpcb", 0); | db_print_inpcb(inp, "inpcb", 0); | ||||
} | } | ||||
#endif /* DDB */ | #endif /* DDB */ | ||||
#ifdef RATELIMIT | |||||
/* | |||||
* Modify existing TX rate limit on inp_txring_ifp and update | |||||
* inpcb info: | |||||
*/ | |||||
static int | |||||
in_pcbmodify_txrtlmt(struct inpcb *inp, struct ifnet *ifp, | |||||
uint32_t max_pacing_rate) | |||||
{ | |||||
struct ifreq_txrtlmt req; | |||||
int error; | |||||
INP_WLOCK_ASSERT(inp); | |||||
req.txring_max_rate = max_pacing_rate; | |||||
req.txring_id = inp->inp_txring_id; | |||||
req.txring_flow_id = inp->inp_flowid; | |||||
req.txring_flow_type = inp->inp_flowtype; | |||||
error = ifp->if_ioctl(ifp, SIOCSRATECTL, (caddr_t)&req); | |||||
if (error) | |||||
return (error); | |||||
inp->inp_txring_max_rate = max_pacing_rate; | |||||
return (0); | |||||
} | |||||
/* | |||||
* Create a TX rate limit on ifp and attach it to inpcb: | |||||
*/ | |||||
static int | |||||
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, | |||||
uint32_t max_pacing_rate) | |||||
{ | |||||
struct ifreq_txrtlmt req; | |||||
int error; | |||||
INP_WLOCK_ASSERT(inp); | |||||
KASSERT(inp->inp_txring_ifp == NULL, | |||||
("%s: inp_txring_ifp != NULL", __func__)); | |||||
gallatin: I wonder if it would be possible to un-staticize these functions for using h/w rate limiting… | |||||
Done Inline ActionsI'll make these functions global. Might also make debugging easier. hselasky: I'll make these functions global. Might also make debugging easier. | |||||
req.txring_max_rate = max_pacing_rate; | |||||
req.txring_flow_id = inp->inp_flowid; | |||||
req.txring_flow_type = inp->inp_flowtype; | |||||
if_ref(ifp); | |||||
error = ifp->if_ioctl(ifp, SIOCARATECTL, (caddr_t)&req); | |||||
if (error) { | |||||
if_rele(ifp); | |||||
return (error); | |||||
} | |||||
inp->inp_txring_ifp = ifp; | |||||
inp->inp_txring_max_rate = max_pacing_rate; | |||||
inp->inp_txring_id = req.txring_id; | |||||
return (0); | |||||
} | |||||
/* | |||||
* Remove TX rate limit from inp_txring_ifp and detach it from | |||||
* the inpcb: | |||||
*/ | |||||
static void | |||||
in_pcbdetach_txrtlmt(struct inpcb *inp) | |||||
{ | |||||
struct ifreq_txrtlmt req; | |||||
struct ifnet *ifp; | |||||
Done Inline ActionsBecause you completely drop your lock in this case, you can't be sure any information you gathered before in_pcb_getwlock is still valid after you call it. A brief skim says that you do make some assumptions that some basic state (e.g. value of some inp->inp_txringid_* variables) stays the same across these calls. Have you done the analysis to ensure this is always the case and you can't ever run into race conditions? jtl: Because you completely drop your lock in this case, you can't be sure any information you… | |||||
Done Inline ActionsCorrect. There is a race there and we're looking into how to close it. Possibly we'll need a separate mutex for the RL state. That way we won't block the RX side from getting the INP_LOCK(). hselasky: Correct. There is a race there and we're looking into how to close it. Possibly we'll need a… | |||||
INP_WLOCK_ASSERT(inp); | |||||
KASSERT(inp->inp_txring_ifp != NULL, | |||||
("%s: inp->inp_txring_ifp == NULL", __func__)); | |||||
ifp = inp->inp_txring_ifp; | |||||
req.txring_id = inp->inp_txring_id; | |||||
req.txring_flow_id = inp->inp_flowid; | |||||
req.txring_flow_type = inp->inp_flowtype; | |||||
inp->inp_txring_ifp = NULL; | |||||
inp->inp_txring_id = 0; | |||||
inp->inp_txring_max_rate = 0; | |||||
/* | |||||
* If the device was detached while we still had reference on | |||||
* ifp, we assume if_dead() was called and replaced callbacks | |||||
* with stubs. | |||||
*/ | |||||
ifp->if_ioctl(ifp, SIOCDRATECTL, (caddr_t)&req); | |||||
if_rele(ifp); | |||||
} | |||||
/* | |||||
* Track route changes and modify the TX rate limit hint in the given | |||||
* mbuf to match what the network driver expects. | |||||
*/ | |||||
void | |||||
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) | |||||
{ | |||||
struct socket *socket; | |||||
uint32_t max_pacing_rate; | |||||
int error; | |||||
if (inp == NULL) | |||||
Done Inline ActionsI think it would be helpful to have an (optional) callback in the inpcb here. The way we (netflix) intend to use this is not via the socket ioclts, but via direct interactions with the TCP stack. If a connection that was previously paced becomes unpaced, we need to know about it so that it can be transitioned to a software pacer. gallatin: I think it would be helpful to have an (optional) callback in the inpcb here.
The way we… | |||||
Done Inline ActionsThere has been implemented a function to query the current rate. Is that sufficient? hselasky: There has been implemented a function to query the current rate. Is that sufficient? | |||||
Done Inline ActionsI think we'd prefer a callback gallatin: I think we'd prefer a callback | |||||
return; | |||||
socket = inp->inp_socket; | |||||
if (socket == NULL) | |||||
return; | |||||
/* | |||||
* NOTE: The so_max_pacing_rate value is read unlocked, | |||||
* because atomic updates are not required since the variable | |||||
* is checked at every mbuf we send. It is assumed that the | |||||
* variable read itself will be atomic. | |||||
*/ | |||||
max_pacing_rate = socket->so_max_pacing_rate; | |||||
if (max_pacing_rate == 0 && inp->inp_txring_ifp == NULL) | |||||
return; | |||||
/* | |||||
* NOTE: When attaching to a network interface a reference is | |||||
Done Inline ActionsTypo: "referece" wblock: Typo: "referece" | |||||
* made to ensure the network interface doesn't go away until | |||||
* all ratelimit connections are gone. The network interface | |||||
* pointers compared below represent valid network interfaces, | |||||
Done Inline ActionsShould you clear INP_RATE_LIMIT_CHANGED at this point? gallatin: Should you clear INP_RATE_LIMIT_CHANGED at this point? | |||||
Done Inline ActionsYes, you're right. I'll fix. hselasky: Yes, you're right. I'll fix. | |||||
* except when comparing towards NULL. | |||||
*/ | |||||
if (ifp != inp->inp_txring_ifp) { | |||||
Done Inline ActionsWhat about adding a callback to this routine to allow the TCP stack to be notified of a rate-limit change (so that it may decide to move a connection to a software pacer, for example). gallatin: What about adding a callback to this routine to allow the TCP stack to be notified of a rate… | |||||
Done Inline ActionsI think there will be a locking issue, LOR, if you try that. But possibly a polling function to poll the status of the current mbuf send tag is appropriate. hselasky: I think there will be a locking issue, LOR, if you try that. But possibly a polling function to… | |||||
bool wlocked = INP_WLOCKED(inp); | |||||
if (!wlocked) { | |||||
/* | |||||
* NOTE: If the write locking fails, we need | |||||
* to bail out and use the non-ratelimited | |||||
* ring for the transmit until there is a new | |||||
* chance to get the write lock. | |||||
*/ | |||||
if (!INP_TRY_UPGRADE(inp)) | |||||
return; | |||||
} | |||||
if (inp->inp_txring_ifp != NULL) | |||||
in_pcbdetach_txrtlmt(inp); | |||||
/* | |||||
* In order to utilize packet pacing with RSS, we need | |||||
* to wait until there is a valid RSS hash before we | |||||
* can proceed: | |||||
*/ | |||||
if (inp->inp_flowtype == M_HASHTYPE_NONE) { | |||||
if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { | |||||
if (!wlocked) | |||||
INP_DOWNGRADE(inp); | |||||
return; | |||||
} | |||||
Not Done Inline ActionsIt would be nice to have a counter here to keep track of howmany ratelimit escapes there were. gallatin: It would be nice to have a counter here to keep track of howmany ratelimit escapes there were. | |||||
/* typically UDP ends up here */ | |||||
inp->inp_flowid = mb->m_pkthdr.flowid; | |||||
inp->inp_flowtype = M_HASHTYPE_GET(mb); | |||||
} | |||||
error = in_pcbattach_txrtlmt(inp, ifp, max_pacing_rate); | |||||
if (!wlocked) | |||||
INP_DOWNGRADE(inp); | |||||
if (error) | |||||
return; | |||||
} else if (inp->inp_txring_max_rate != max_pacing_rate) { | |||||
bool wlocked = INP_WLOCKED(inp); | |||||
if (!wlocked) { | |||||
/* | |||||
* NOTE: If the write locking fails, use the | |||||
* current pacing rate until there is a new | |||||
* chance to write lock: | |||||
*/ | |||||
if (!INP_TRY_UPGRADE(inp)) | |||||
goto done; | |||||
} | |||||
error = in_pcbmodify_txrtlmt(inp, ifp, max_pacing_rate); | |||||
if (!wlocked) | |||||
INP_DOWNGRADE(inp); | |||||
if (error) | |||||
Done Inline ActionsI thought we were going to stop clobbering the flowid, and use a different field as we discussed at BSDCan? gallatin: I thought we were going to stop clobbering the flowid, and use a different field as we… | |||||
Done Inline ActionsHi Drew, The RSS hashes have been passed onto the IOCTL requests, so that the IOCTL handler can decide what to do. I'm still waiting for Navdeep to reply on this approach. Like stated before only the 7 lowest bits are used in the TX path for CPU/bucket selection and the rest can be discarded, from my point of view. --HPS hselasky: Hi Drew,
The RSS hashes have been passed onto the IOCTL requests, so that the IOCTL handler… | |||||
goto done; /* use old rate */ | |||||
} | |||||
done: | |||||
/* | |||||
* Update the flow ID and RSS hash for the transmitted mbuf. | |||||
*/ | |||||
mb->m_pkthdr.flowid = inp->inp_txring_id; | |||||
M_HASHTYPE_SET(mb, M_HASHTYPE_TXRTLMT); | |||||
} | |||||
#endif /* RATELIMIT */ | |||||
Done Inline ActionsMaybe rename this "upgraded" & move the INP_WLOCKED() check directly into the if() -- the existing code is a bit hard to understand (at least for me). On my first read, I thought INP_DOWNGRADE() was unreachable because wlocked would have to be true (which is incorrect) gallatin: Maybe rename this "upgraded" & move the INP_WLOCKED() check directly into the if() -- the… | |||||
Done Inline ActionsNoted. hselasky: Noted. |
I wonder if it would be possible to un-staticize these functions for using h/w rate limiting internally in TCP.
We'd probably also want an API to query the available hardware pacing rates on a given inp.