Changeset View
Standalone View
sys/netinet/in_pcb.c
Show All 36 Lines | |||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include "opt_ddb.h" | #include "opt_ddb.h" | ||||
#include "opt_ipsec.h" | #include "opt_ipsec.h" | ||||
#include "opt_inet.h" | #include "opt_inet.h" | ||||
#include "opt_inet6.h" | #include "opt_inet6.h" | ||||
#include "opt_ratelimit.h" | |||||
#include "opt_pcbgroup.h" | #include "opt_pcbgroup.h" | ||||
#include "opt_rss.h" | #include "opt_rss.h" | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
#include <sys/callout.h> | #include <sys/callout.h> | ||||
#include <sys/eventhandler.h> | #include <sys/eventhandler.h> | ||||
#include <sys/domain.h> | #include <sys/domain.h> | ||||
#include <sys/protosw.h> | #include <sys/protosw.h> | ||||
#include <sys/rmlock.h> | #include <sys/rmlock.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <sys/sockio.h> | |||||
#include <sys/priv.h> | #include <sys/priv.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/refcount.h> | #include <sys/refcount.h> | ||||
#include <sys/jail.h> | #include <sys/jail.h> | ||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#ifdef DDB | #ifdef DDB | ||||
▲ Show 20 Lines • Show All 1,067 Lines • ▼ Show 20 Lines | |||||
* socket, in which case in_pcbfree() is deferred. | * socket, in which case in_pcbfree() is deferred. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbdetach(struct inpcb *inp) | in_pcbdetach(struct inpcb *inp) | ||||
{ | { | ||||
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); | KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); | ||||
#ifdef RATELIMIT | |||||
if (inp->inp_snd_tag != NULL) | |||||
in_pcbdetach_txrtlmt(inp); | |||||
#endif | |||||
inp->inp_socket->so_pcb = NULL; | inp->inp_socket->so_pcb = NULL; | ||||
inp->inp_socket = NULL; | inp->inp_socket = NULL; | ||||
} | } | ||||
/* | /* | ||||
* in_pcbref() bumps the reference count on an inpcb in order to maintain | * in_pcbref() bumps the reference count on an inpcb in order to maintain | ||||
* stability of an inpcb pointer despite the inpcb lock being released. This | * stability of an inpcb pointer despite the inpcb lock being released. This | ||||
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | ||||
▲ Show 20 Lines • Show All 1,521 Lines • ▼ Show 20 Lines | if (!have_addr) { | ||||
db_printf("usage: show inpcb <addr>\n"); | db_printf("usage: show inpcb <addr>\n"); | ||||
return; | return; | ||||
} | } | ||||
inp = (struct inpcb *)addr; | inp = (struct inpcb *)addr; | ||||
db_print_inpcb(inp, "inpcb", 0); | db_print_inpcb(inp, "inpcb", 0); | ||||
} | } | ||||
#endif /* DDB */ | #endif /* DDB */ | ||||
#ifdef RATELIMIT | |||||
/* | |||||
* Modify TX rate limit based on the existing "inp->inp_snd_tag", | |||||
* if any. | |||||
*/ | |||||
int | |||||
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) | |||||
{ | |||||
union if_snd_tag_modify_params params = { | |||||
.rate_limit.max_rate = max_pacing_rate, | |||||
}; | |||||
struct m_snd_tag *mst; | |||||
struct ifnet *ifp; | |||||
int error; | |||||
mst = inp->inp_snd_tag; | |||||
if (mst == NULL) | |||||
return (EINVAL); | |||||
ifp = mst->ifp; | |||||
if (ifp == NULL) | |||||
return (EINVAL); | |||||
if (ifp->if_snd_tag_modify == NULL) { | |||||
error = EOPNOTSUPP; | |||||
} else { | |||||
error = ifp->if_snd_tag_modify(mst, ¶ms); | |||||
} | |||||
return (error); | |||||
} | |||||
/* | |||||
* Query existing TX rate limit based on the existing | |||||
* "inp->inp_snd_tag", if any. | |||||
*/ | |||||
int | |||||
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) | |||||
{ | |||||
union if_snd_tag_query_params params = { }; | |||||
struct m_snd_tag *mst; | |||||
struct ifnet *ifp; | |||||
gallatin: I wonder if it would be possible to un-staticize these functions for using h/w rate limiting… | |||||
Done Inline ActionsI'll make these functions global. Might also make debugging easier. hselasky: I'll make these functions global. Might also make debugging easier. | |||||
int error; | |||||
mst = inp->inp_snd_tag; | |||||
if (mst == NULL) | |||||
return (EINVAL); | |||||
ifp = mst->ifp; | |||||
if (ifp == NULL) | |||||
return (EINVAL); | |||||
if (ifp->if_snd_tag_query == NULL) { | |||||
error = EOPNOTSUPP; | |||||
} else { | |||||
error = ifp->if_snd_tag_query(mst, ¶ms); | |||||
if (error == 0 && p_max_pacing_rate != NULL) | |||||
*p_max_pacing_rate = params.rate_limit.max_rate; | |||||
} | |||||
return (error); | |||||
} | |||||
/* | |||||
* Allocate a new TX rate limit send tag from the network interface | |||||
* given by the "ifp" argument and save it in "inp->inp_snd_tag": | |||||
*/ | |||||
int | |||||
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, | |||||
uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) | |||||
{ | |||||
union if_snd_tag_alloc_params params = { | |||||
.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, | |||||
.rate_limit.hdr.flowid = flowid, | |||||
.rate_limit.hdr.flowtype = flowtype, | |||||
.rate_limit.max_rate = max_pacing_rate, | |||||
}; | |||||
int error; | |||||
INP_WLOCK_ASSERT(inp); | |||||
Done Inline ActionsBecause you completely drop your lock in this case, you can't be sure any information you gathered before in_pcb_getwlock is still valid after you call it. A brief skim says that you do make some assumptions that some basic state (e.g. value of some inp->inp_txringid_* variables) stays the same across these calls. Have you done the analysis to ensure this is always the case and you can't ever run into race conditions? jtl: Because you completely drop your lock in this case, you can't be sure any information you… | |||||
Done Inline ActionsCorrect. There is a race there and we're looking into how to close it. Possibly we'll need a separate mutex for the RL state. That way we won't block the RX side from getting the INP_LOCK(). hselasky: Correct. There is a race there and we're looking into how to close it. Possibly we'll need a… | |||||
if (inp->inp_snd_tag != NULL) | |||||
return (EINVAL); | |||||
if (ifp->if_snd_tag_alloc == NULL) { | |||||
error = EOPNOTSUPP; | |||||
} else { | |||||
error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); | |||||
/* | |||||
* At success increment the refcount on | |||||
* the send tag's network interface: | |||||
*/ | |||||
if (error == 0) | |||||
if_ref(inp->inp_snd_tag->ifp); | |||||
} | |||||
return (error); | |||||
} | |||||
/* | |||||
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag", | |||||
* if any: | |||||
*/ | |||||
void | |||||
in_pcbdetach_txrtlmt(struct inpcb *inp) | |||||
{ | |||||
struct m_snd_tag *mst; | |||||
Done Inline ActionsI think it would be helpful to have an (optional) callback in the inpcb here. The way we (netflix) intend to use this is not via the socket ioclts, but via direct interactions with the TCP stack. If a connection that was previously paced becomes unpaced, we need to know about it so that it can be transitioned to a software pacer. gallatin: I think it would be helpful to have an (optional) callback in the inpcb here.
The way we… | |||||
Done Inline ActionsThere has been implemented a function to query the current rate. Is that sufficient? hselasky: There has been implemented a function to query the current rate. Is that sufficient? | |||||
Done Inline ActionsI think we'd prefer a callback gallatin: I think we'd prefer a callback | |||||
struct ifnet *ifp; | |||||
INP_WLOCK_ASSERT(inp); | |||||
mst = inp->inp_snd_tag; | |||||
inp->inp_snd_tag = NULL; | |||||
if (mst == NULL) | |||||
return; | |||||
ifp = mst->ifp; | |||||
if (ifp == NULL) | |||||
return; | |||||
/* | |||||
* If the device was detached while we still had reference(s) | |||||
* on the ifp, we assume if_snd_tag_free() was replaced with | |||||
* stubs. | |||||
*/ | |||||
Done Inline ActionsTypo: "referece" wblock: Typo: "referece" | |||||
ifp->if_snd_tag_free(mst); | |||||
/* release reference count on network interface */ | |||||
Done Inline ActionsShould you clear INP_RATE_LIMIT_CHANGED at this point? gallatin: Should you clear INP_RATE_LIMIT_CHANGED at this point? | |||||
Done Inline ActionsYes, you're right. I'll fix. hselasky: Yes, you're right. I'll fix. | |||||
if_rele(ifp); | |||||
} | |||||
Done Inline ActionsWhat about adding a callback to this routine to allow the TCP stack to be notified of a rate-limit change (so that it may decide to move a connection to a software pacer, for example). gallatin: What about adding a callback to this routine to allow the TCP stack to be notified of a rate… | |||||
Done Inline ActionsI think there will be a locking issue, LOR, if you try that. But possibly a polling function to poll the status of the current mbuf send tag is appropriate. hselasky: I think there will be a locking issue, LOR, if you try that. But possibly a polling function to… | |||||
/* | |||||
* This function should be called when the INP_RATE_LIMIT_CHANGED flag | |||||
* is set in the fast path and will attach/detach/modify the TX rate | |||||
* limit send tag based on the socket's so_max_pacing_rate value. | |||||
*/ | |||||
void | |||||
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) | |||||
{ | |||||
struct socket *socket; | |||||
uint32_t max_pacing_rate; | |||||
bool did_upgrade; | |||||
int error; | |||||
if (inp == NULL) | |||||
return; | |||||
socket = inp->inp_socket; | |||||
if (socket == NULL) | |||||
return; | |||||
if (!INP_WLOCKED(inp)) { | |||||
/* | |||||
* NOTE: If the write locking fails, we need to bail | |||||
* out and use the non-ratelimited ring for the | |||||
* transmit until there is a new chance to get the | |||||
* write lock. | |||||
*/ | |||||
Not Done Inline ActionsIt would be nice to have a counter here to keep track of howmany ratelimit escapes there were. gallatin: It would be nice to have a counter here to keep track of howmany ratelimit escapes there were. | |||||
if (!INP_TRY_UPGRADE(inp)) | |||||
return; | |||||
did_upgrade = 1; | |||||
} else { | |||||
did_upgrade = 0; | |||||
} | |||||
/* | |||||
* NOTE: The so_max_pacing_rate value is read unlocked, | |||||
* because atomic updates are not required since the variable | |||||
* is checked at every mbuf we send. It is assumed that the | |||||
* variable read itself will be atomic. | |||||
*/ | |||||
max_pacing_rate = socket->so_max_pacing_rate; | |||||
/* | |||||
* NOTE: When attaching to a network interface a reference is | |||||
* made to ensure the network interface doesn't go away until | |||||
* all ratelimit connections are gone. The network interface | |||||
* pointers compared below represent valid network interfaces, | |||||
* except when comparing towards NULL. | |||||
*/ | |||||
if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { | |||||
error = 0; | |||||
} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { | |||||
if (inp->inp_snd_tag != NULL) | |||||
in_pcbdetach_txrtlmt(inp); | |||||
error = 0; | |||||
} else if (inp->inp_snd_tag == NULL) { | |||||
Done Inline ActionsI thought we were going to stop clobbering the flowid, and use a different field as we discussed at BSDCan? gallatin: I thought we were going to stop clobbering the flowid, and use a different field as we… | |||||
Done Inline ActionsHi Drew, The RSS hashes have been passed onto the IOCTL requests, so that the IOCTL handler can decide what to do. I'm still waiting for Navdeep to reply on this approach. Like stated before only the 7 lowest bits are used in the TX path for CPU/bucket selection and the rest can be discarded, from my point of view. --HPS hselasky: Hi Drew,
The RSS hashes have been passed onto the IOCTL requests, so that the IOCTL handler… | |||||
/* | |||||
* In order to utilize packet pacing with RSS, we need | |||||
* to wait until there is a valid RSS hash before we | |||||
* can proceed: | |||||
*/ | |||||
if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { | |||||
error = EAGAIN; | |||||
} else { | |||||
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), | |||||
mb->m_pkthdr.flowid, max_pacing_rate); | |||||
} | |||||
} else { | |||||
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); | |||||
} | |||||
if (error == 0 || error == EOPNOTSUPP) | |||||
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; | |||||
if (did_upgrade) | |||||
INP_DOWNGRADE(inp); | |||||
} | |||||
/* | |||||
* Track route changes for TX rate limiting. | |||||
*/ | |||||
void | |||||
in_pcboutput_eagain(struct inpcb *inp) | |||||
{ | |||||
struct socket *socket; | |||||
bool did_upgrade; | |||||
if (inp == NULL) | |||||
return; | |||||
socket = inp->inp_socket; | |||||
if (socket == NULL) | |||||
return; | |||||
if (inp->inp_snd_tag == NULL) | |||||
return; | |||||
if (!INP_WLOCKED(inp)) { | |||||
/* | |||||
* NOTE: If the write locking fails, we need to bail | |||||
* out and use the non-ratelimited ring for the | |||||
* transmit until there is a new chance to get the | |||||
* write lock. | |||||
*/ | |||||
if (!INP_TRY_UPGRADE(inp)) | |||||
return; | |||||
did_upgrade = 1; | |||||
} else { | |||||
did_upgrade = 0; | |||||
} | |||||
/* detach rate limiting */ | |||||
Done Inline ActionsMaybe rename this "upgraded" & move the INP_WLOCKED() check directly into the if() -- the existing code is a bit hard to understand (at least for me). On my first read, I thought INP_DOWNGRADE() was unreachable because wlocked would have to be true (which is incorrect) gallatin: Maybe rename this "upgraded" & move the INP_WLOCKED() check directly into the if() -- the… | |||||
Done Inline ActionsNoted. hselasky: Noted. | |||||
in_pcbdetach_txrtlmt(inp); | |||||
/* make sure new mbuf send tag allocation is made */ | |||||
inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; | |||||
if (did_upgrade) | |||||
INP_DOWNGRADE(inp); | |||||
} | |||||
#endif /* RATELIMIT */ |
I wonder if it would be possible to un-staticize these functions for using h/w rate limiting internally in TCP.
We'd probably also want an API to query the available hardware pacing rates on a given inp.