Page MenuHomeFreeBSD

D55967.diff
No OneTemporary

D55967.diff

diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -321,10 +321,14 @@
* Almost all fields of struct inpcb are static after creation or protected by
* a per-inpcb rwlock, inp_lock.
*
- * A inpcb database is indexed by addresses/ports hash as well as list of
- * all pcbs that belong to a certain proto. Database lookups or list traversals
- * are be performed inside SMR section. Once desired PCB is found its own
- * lock is to be obtained and SMR section exited.
+ * An inpcb database consists of two hash tables: one for connected pcbs and
+ * the other for wildcard-bound pcbs. The newborn pcbs reside on the
+ * unconnected list. Although a pcb can be on any of these three lists, we
+ * can't share the linkage pointer due to its CK-nature. The only thing that
+ * can be unionized is the load-balance table and exact hash, as a pcb can
+ * never participate in both tables throughout its entire lifetime. Database
+ * lookups or list traversals are to be performed inside SMR section. Once
+ * desired PCB is found, its own lock is to be obtained and SMR section exited.
*
* Key:
* (c) - Constant after initialization
@@ -352,10 +356,11 @@
struct inpcb {
/* Cache line #1 (amd64) */
union {
- CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */
- LIST_ENTRY(inpcb) inp_lbgroup_list; /* lb group list */
+ CK_LIST_ENTRY(inpcb) inp_hash_exact;
+ LIST_ENTRY(inpcb) inp_lbgroup_list;
};
- CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */
+ CK_LIST_ENTRY(inpcb) inp_hash_wild;
+ CK_LIST_ENTRY(inpcb) inp_unconn_list;
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_refcount
@@ -412,7 +417,6 @@
struct route inp_route;
struct route_in6 inp_route6;
};
- CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#define inp_vnet inp_pcbinfo->ipi_vnet
@@ -431,7 +435,6 @@
* (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
- struct inpcbhead ipi_listhead; /* (r:e/w:h) */
u_int ipi_count; /* (h) */
/*
@@ -460,6 +463,7 @@
* address, and "wild" holds the rest.
*/
struct mtx ipi_hash_lock;
+ struct inpcbhead ipi_list_unconn; /* (r:e/w:h) */
struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */
struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */
u_long ipi_hashmask; /* (c) */
@@ -671,13 +675,20 @@
int (*ctloutput_set)(struct inpcb *, struct sockopt *));
#endif
+/*
+ * struct inpcb_iterator is located on the stack of a function that uses
+ * inp_next(). The const members are set by the caller with INP_ITERATOR().
+ * (c) - caller
+ * (n) - inp_next()
+ */
typedef bool inp_match_t(const struct inpcb *, void *);
struct inpcb_iterator {
const struct inpcbinfo *ipi;
- struct inpcb *inp;
+ struct inpcb *inp; /* c:r, n:w */
inp_match_t *match;
void *ctx;
- int hash;
+ int hash; /* c:w, n:rw */
+ int mode; /* n:rw */
#define INP_ALL_LIST -1
const inp_lookup_t lock;
};
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -241,8 +241,6 @@
"Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif
-static void in_pcbremhash(struct inpcb *);
-
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
@@ -562,7 +560,7 @@
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
- CK_LIST_INIT(&pcbinfo->ipi_listhead);
+ CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
pcbinfo->ipi_count = 0;
pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
@@ -691,7 +689,7 @@
INP_HASH_WLOCK(pcbinfo);
pcbinfo->ipi_count++;
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
INP_HASH_WUNLOCK(pcbinfo);
so->so_pcb = inp;
@@ -1423,7 +1421,9 @@
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
/* See the comment in in_pcbinshash(). */
@@ -1546,8 +1546,13 @@
*
* - Iterator can have either write-lock or read-lock semantics, that can not
* be changed later.
- * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
- * a single hash slot. Note: only rip_input() does the latter.
+ * - Iterator can iterate over all possible lists where inpcb resides:
+ * if initialized with INP_ALL_LIST hash, the iterator will go through the
+ * unconnected list, then all wildcard hash slots and then all exact hash
+ * slots. Alternatively iterator can go through a single exact hash slot.
+ * Note: only rip_input() and sysctl_setsockopt() do the latter.
+ * The interface may be extended for iteration over a single wildcard hash
+ * slot, but there is no use case for that today.
* - Iterator may have optional bool matching function. The matching function
* will be executed for each inpcb in the SMR context, so it can not acquire
* locks and can safely access only immutable fields of inpcb.
@@ -1565,49 +1570,72 @@
* - Removed entries won't stop traversal as long as they are not added to
* a different list. This is violated by in_pcbrehash().
*/
-#define II_LIST_FIRST(ipi, hash) \
- (((hash) == INP_ALL_LIST) ? \
- CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
- CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
-#define II_LIST_NEXT(inp, hash) \
- (((hash) == INP_ALL_LIST) ? \
- CK_LIST_NEXT((inp), inp_list) : \
- CK_LIST_NEXT((inp), inp_hash_exact))
-#define II_LOCK_ASSERT(inp, lock) \
- rw_assert(&(inp)->inp_lock, \
- (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
+static inline struct inpcb *
+ii_list_first(const struct inpcb_iterator *ii)
+{
+ const struct inpcbinfo *ipi = ii->ipi;
+ const int hash = ii->hash;
+ /* Return the head of whichever list the current ii->hash value selects. */
+ if (hash == INP_ALL_LIST)	/* -1; must be tested before the unsigned compare */
+ return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
+ else if (hash <= ipi->ipi_hashmask)	/* 0..hashmask: wildcard hash slot */
+ return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
+ else	/* above hashmask: exact hash slot, index rebased to start at 0 */
+ return (CK_LIST_FIRST(
+ &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
+}
+
+static inline struct inpcb *
+ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
+{	/* Return the successor of inp via the linkage that ii->hash selects. */
+ if (ii->hash == INP_ALL_LIST)	/* walking the unconnected list */
+ return (CK_LIST_NEXT(inp, inp_unconn_list));
+ else if (ii->hash <= ii->ipi->ipi_hashmask)	/* walking a wildcard slot */
+ return (CK_LIST_NEXT(inp, inp_hash_wild));
+ else	/* walking an exact hash slot */
+ return (CK_LIST_NEXT(inp, inp_hash_exact));
+}
+
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
const struct inpcbinfo *ipi = ii->ipi;
+ const int hashmax = (ipi->ipi_hashmask + 1) * 2;
inp_match_t *match = ii->match;
void *ctx = ii->ctx;
inp_lookup_t lock = ii->lock;
- int hash = ii->hash;
struct inpcb *inp;
if (ii->inp == NULL) { /* First call. */
+ if ((ii->mode = ii->hash) != INP_ALL_LIST) {
+ /* Targeted iterators support only the exact hash. */
+ MPASS(ii->hash <= ipi->ipi_hashmask);
+ ii->hash += ipi->ipi_hashmask + 1;
+ }
smr_enter(ipi->ipi_smr);
- /* This is unrolled CK_LIST_FOREACH(). */
- for (inp = II_LIST_FIRST(ipi, hash);
+next_first:
+ /* This is unrolled CK_LIST_FOREACH() over different headers. */
+ for (inp = ii_list_first(ii);
inp != NULL;
- inp = II_LIST_NEXT(inp, hash)) {
+ inp = ii_list_next(ii, inp)) {
if (match != NULL && (match)(inp, ctx) == false)
continue;
if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
break;
else {
smr_enter(ipi->ipi_smr);
- MPASS(inp != II_LIST_FIRST(ipi, hash));
- inp = II_LIST_FIRST(ipi, hash);
+ MPASS(inp != ii_list_first(ii));
+ inp = ii_list_first(ii);
if (inp == NULL)
break;
}
}
- if (inp == NULL)
+ if (inp == NULL) {
+ if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
+ goto next_first;
smr_exit(ipi->ipi_smr);
- else
+ } else
ii->inp = inp;
return (inp);
@@ -1617,11 +1645,18 @@
smr_enter(ipi->ipi_smr);
restart:
inp = ii->inp;
- II_LOCK_ASSERT(inp, lock);
+ rw_assert(&inp->inp_lock,
+ lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
next:
- inp = II_LIST_NEXT(inp, hash);
+ inp = ii_list_next(ii, inp);
if (inp == NULL) {
+ if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
+ inp_unlock(ii->inp, lock);
+ ii->inp = NULL;
+ goto next_first;
+ }
smr_exit(ipi->ipi_smr);
+ ii->hash = ii->mode; /* Make ii reusable. */
goto found;
}
@@ -1793,10 +1828,11 @@
*/
INP_HASH_WLOCK(pcbinfo);
if (inp->inp_flags & INP_INHASHLIST)
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ else
+ CK_LIST_REMOVE(inp, inp_unconn_list);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
pcbinfo->ipi_count--;
- CK_LIST_REMOVE(inp, inp_list);
INP_HASH_WUNLOCK(pcbinfo);
#ifdef RATELIMIT
@@ -1876,8 +1912,13 @@
INP_WLOCK_ASSERT(inp);
inp->inp_flags |= INP_DROPPED;
- if (inp->inp_flags & INP_INHASHLIST)
+ if (inp->inp_flags & INP_INHASHLIST) {
+ INP_HASH_WLOCK(inp->inp_pcbinfo);
in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
+ INP_HASH_WUNLOCK(inp->inp_pcbinfo);
+ }
}
#ifdef INET
@@ -2687,6 +2728,8 @@
inp->inp_smr = SMR_SEQ_INVALID;
}
+ CK_LIST_REMOVE(inp, inp_unconn_list);
+
if (connected)
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
else {
@@ -2704,7 +2747,7 @@
}
void
-in_pcbremhash_locked(struct inpcb *inp)
+in_pcbremhash(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
@@ -2731,14 +2774,6 @@
inp->inp_flags &= ~INP_INHASHLIST;
}
-static void
-in_pcbremhash(struct inpcb *inp)
-{
- INP_HASH_WLOCK(inp->inp_pcbinfo);
- in_pcbremhash_locked(inp);
- INP_HASH_WUNLOCK(inp->inp_pcbinfo);
-}
-
/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
@@ -2781,15 +2816,12 @@
* When rehashing, the caller must ensure that either the new or the old
* foreign address was unspecified.
*/
- if (connected)
- CK_LIST_REMOVE(inp, inp_hash_wild);
- else
- CK_LIST_REMOVE(inp, inp_hash_exact);
-
if (connected) {
+ CK_LIST_REMOVE(inp, inp_hash_wild);
head = &pcbinfo->ipi_hash_exact[hash];
CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
} else {
+ CK_LIST_REMOVE(inp, inp_hash_exact);
head = &pcbinfo->ipi_hash_wild[hash];
CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
}
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -57,7 +57,7 @@
int, int, struct ucred *);
int in_pcbinshash(struct inpcb *);
void in_pcbrehash(struct inpcb *);
-void in_pcbremhash_locked(struct inpcb *);
+void in_pcbremhash(struct inpcb *);
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2841,9 +2841,12 @@
static void
db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog,
- bool show_inpcb)
+ bool show_inpcb, bool only_locked)
{
+ if (only_locked && tp->t_inpcb.inp_lock.rw_lock == RW_UNLOCKED)
+ return;
+
db_print_indent(indent);
db_printf("%s at %p\n", name, tp);
@@ -2987,14 +2990,13 @@
show_bblog = strchr(modif, 'b') != NULL;
show_inpcb = strchr(modif, 'i') != NULL;
tp = (struct tcpcb *)addr;
- db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+ db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb, false);
}
DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
{
VNET_ITERATOR_DECL(vnet_iter);
struct inpcb *inp;
- struct tcpcb *tp;
bool only_locked, show_bblog, show_inpcb;
only_locked = strchr(modif, 'l') != NULL;
@@ -3002,18 +3004,28 @@
show_inpcb = strchr(modif, 'i') != NULL;
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
- if (only_locked &&
- inp->inp_lock.rw_lock == RW_UNLOCKED)
- continue;
- tp = intotcpcb(inp);
- db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+ /* Walk every port hash chain. */
+ for (u_int i = 0; i <= V_tcbinfo.ipi_porthashmask; i++)
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_porthashbase[i],
+ inp_portlist) {
+ db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+ show_bblog, show_inpcb, only_locked);
+ /* Pager quit: skip the unconnected list as well. */
+ if (db_pager_quit)
+ goto restore;
+ }
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_list_unconn,
+ inp_unconn_list) {
+ db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+ show_bblog, show_inpcb, only_locked);
if (db_pager_quit)
break;
}
+restore:
CURVNET_RESTORE();
+ /* Stop visiting further vnets once the pager has quit. */
if (db_pager_quit)
break;
}
}
#endif
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -511,7 +511,9 @@
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
/* See the comment in in_pcbinshash(). */

File Metadata

Mime Type
text/plain
Expires
Thu, Mar 26, 6:37 AM (44 m, 24 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30338501
Default Alt Text
D55967.diff (13 KB)

Event Timeline