sys/netinet/tcp_hostcache.c
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
+ * Copyright (c) 2021 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
[... 27 lines not shown ...]
  *
  * Due to the tcp_hostcache, all TCP-specific metrics information in the
  * routing table have been removed.  The inpcb no longer keeps a pointer to
  * the routing entry, and protocol-initiated route cloning has been removed
  * as well.  With these changes, the routing table has gone back to being
  * more lightweight and only carries information related to packet forwarding.
  *
  * tcp_hostcache is designed for multiple concurrent access in SMP
- * environments and high contention.  All bucket rows have their own lock and
- * thus multiple lookups and modifies can be done at the same time as long as
- * they are in different bucket rows.  If a request for insertion of a new
- * record can't be satisfied, it simply returns an empty structure.  Nobody
- * and nothing outside of tcp_hostcache.c will ever point directly to any
- * entry in the tcp_hostcache.  All communication is done in an
- * object-oriented way and only functions of tcp_hostcache will manipulate
- * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
- * concurrent access situations.  Since tcp_hostcache is only caching
- * information, there are no fatal consequences if we either can't satisfy
- * any particular request or have to drop/overwrite an existing entry because
- * of bucket limit memory constrains.
+ * environments and high contention.  It is a straight hash.  Each bucket row
+ * is protected by its own lock for modification.  Readers are protected by
+ * SMR.  This puts certain restrictions on writers, e.g. a writer shall only
+ * insert a fully populated entry into a row.  A writer can't reuse the least
+ * used entry if a hash row is full.  Value updates for an entry shall be
+ * atomic.
+ *
+ * TCP stack(s) communicate with tcp_hostcache via the KBI functions
+ * tcp_hc_*() and the hc_metrics_lite structure.
+ *
+ * Since tcp_hostcache is only caching information, there are no fatal
+ * consequences if we either can't allocate a new entry or have to drop
+ * an existing entry, or return somewhat stale information.
  */

 /*
  * Many thanks to jlemon for basic structure of tcp_syncache which is being
  * followed here.
  */
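To illustrate the discipline the header comment describes — lock-free readers inside an SMR section, writers serialized on the per-row mutex — here is a minimal sketch. It is not part of the change; the function names and the struct tcp_hostcache pointer parameter are hypothetical, while smr_enter()/smr_exit(), the CK_SLIST macros, and the atomic(9) operations are the real primitives the patch uses below.

/* Sketch only: SMR read side.  Returns a possibly stale, never torn value. */
static uint32_t
example_read_mtu(struct tcp_hostcache *hc, struct hc_head *row)
{
	struct hc_metrics *e;
	uint32_t mtu;

	smr_enter(hc->smr);		/* readers never block writers */
	e = CK_SLIST_FIRST(&row->hch_bucket);
	mtu = (e != NULL) ? atomic_load_32(&e->rmx_mtu) : 0;
	smr_exit(hc->smr);		/* after this, e must not be touched */
	return (mtu);
}

/* Sketch only: write side.  Mutations require the bucket row lock. */
static void
example_write_mtu(struct hc_head *row, struct hc_metrics *e, uint32_t mtu)
{
	mtx_lock(&row->hch_mtx);
	atomic_store_32(&e->rmx_mtu, mtu);	/* whole-word store for readers */
	mtx_unlock(&row->hch_mtx);
}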
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");

 #include "opt_inet6.h"

 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
+#include <sys/smr.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>

 #include <net/vnet.h>

 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>

 #include <vm/uma.h>
-TAILQ_HEAD(hc_qhead, hc_metrics);

 struct hc_head {
-	struct hc_qhead	hch_bucket;
+	CK_SLIST_HEAD(hc_qhead, hc_metrics) hch_bucket;
 	u_int		hch_length;
 	struct mtx	hch_mtx;
 };
 struct hc_metrics {
 	/* housekeeping */
-	TAILQ_ENTRY(hc_metrics)	rmx_q;
-	struct hc_head	*rmx_head;	/* head of bucket tail queue */
+	CK_SLIST_ENTRY(hc_metrics) rmx_q;
 	struct in_addr	ip4;		/* IP address */
 	struct in6_addr	ip6;		/* IP6 address */
 	uint32_t	ip6_zoneid;	/* IPv6 scope zone id */
 	/* endpoint specific values for tcp */
 	uint32_t	rmx_mtu;	/* MTU for this path */
 	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
 	uint32_t	rmx_rtt;	/* estimated round trip time */
 	uint32_t	rmx_rttvar;	/* estimated rtt variance */
 	uint32_t	rmx_cwnd;	/* congestion window */
 	uint32_t	rmx_sendpipe;	/* outbound delay-bandwidth product */
 	uint32_t	rmx_recvpipe;	/* inbound delay-bandwidth product */
 	/* TCP hostcache internal data */
 	int		rmx_expire;	/* lifetime for object */
 #ifdef	TCP_HC_COUNTERS
 	u_long		rmx_hits;	/* number of hits */
 	u_long		rmx_updates;	/* number of updates */
 #endif
 };
 struct tcp_hostcache {
 	struct hc_head	*hashbase;
 	uma_zone_t	zone;
+	smr_t		smr;
 	u_int		hashsize;
 	u_int		hashmask;
 	u_int		hashsalt;
 	u_int		bucket_limit;
 	u_int		cache_count;
 	u_int		cache_limit;
 	int		expire;
 	int		prune;
 	int		purgeall;
 };
 /* Arbitrary values */
 #define TCP_HOSTCACHE_HASHSIZE		512
 #define TCP_HOSTCACHE_BUCKETLIMIT	30
 #define TCP_HOSTCACHE_EXPIRE		60*60	/* one hour */
 #define TCP_HOSTCACHE_PRUNE		5*60	/* every 5 minutes */
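A worked example of how these defaults interact (derived from the lookup and purge paths below): a new or freshly referenced entry has rmx_expire set to TCP_HOSTCACHE_EXPIRE (3600 seconds); tcp_hc_purge_internal() runs every TCP_HOSTCACHE_PRUNE (300 seconds) and subtracts that interval, so an entry that is never looked up again is reclaimed after 3600 / 300 = 12 pruner passes, while any intervening lookup resets the countdown to the full hour.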
 VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache);
 #define	V_tcp_hostcache		VNET(tcp_hostcache)

 VNET_DEFINE_STATIC(struct callout, tcp_hc_callout);
 #define	V_tcp_hc_callout	VNET(tcp_hc_callout)
-static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *, bool);
-static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
+static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *, bool,
+		    struct hc_head **, struct hc_metrics **);
 static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS);
 static void tcp_hc_purge_internal(int);
 static void tcp_hc_purge(void *);
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
[... 103 lines not shown; the fold ends inside tcp_hc_init(void) ...]
 	V_tcp_hostcache.hashbase = (struct hc_head *)
 	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
 	    M_HOSTCACHE, M_WAITOK | M_ZERO);

 	/*
 	 * Initialize the hash buckets.
 	 */
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
-		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
+		CK_SLIST_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
 		V_tcp_hostcache.hashbase[i].hch_length = 0;
 		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
 		    NULL, MTX_DEF);
 	}

 	/*
 	 * Allocate the hostcache entries.
 	 */
 	V_tcp_hostcache.zone =
 	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
+	V_tcp_hostcache.smr = uma_zone_get_smr(V_tcp_hostcache.zone);
markj: Really it is somewhat strange to allocate a new SMR zone for each VNET. I understand that it's the pattern we use today, but we should permit VNET zones to share slabs and SMR trackers. This is just an observation.

glebius: I totally agree. Can do this single zone change prior to SMR change.

glebius: It appears that all TCP UMA zones are VNET local. I don't like that, but I would prefer to leave it as is and let VIMAGE fans and experts (I'm neither) decide whether they should stay VNET local or become global.

See the sketch after this function for what a shared, VNET-agnostic zone could look like.
 	/*
 	 * Set up periodic cache cleanup.
 	 */
 	callout_init(&V_tcp_hc_callout, 1);
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, curvnet);
 }
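As a sketch of the shared-zone direction markj raises above — one global zone whose slabs and SMR tracker all VNETs would share — something like the following could replace the per-VNET uma_zcreate(). This is purely hypothetical and not part of this change; the names tcp_hc_zone and tcp_hc_smr and the SYSINIT placement are assumptions.

/* Hypothetical: one global SMR zone instead of a zone per VNET. */
static uma_zone_t	tcp_hc_zone;
static smr_t		tcp_hc_smr;

static void
tcp_hc_zone_global_init(void *arg __unused)
{
	tcp_hc_zone = uma_zcreate("hostcache", sizeof(struct hc_metrics),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	tcp_hc_smr = uma_zone_get_smr(tcp_hc_zone);
	/* Per-VNET code would then reference tcp_hc_zone/tcp_hc_smr. */
}
SYSINIT(tcp_hc_zone, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    tcp_hc_zone_global_init, NULL);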
[... 14 lines not shown; the fold ends inside tcp_hc_destroy(void) ...]
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++)
 		mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
 }
 #endif
 /*
- * Internal function: look up an entry in the hostcache or return NULL.
+ * Internal function: look up an entry in the hostcache.
  *
- * If an entry has been returned, the caller becomes responsible for
- * unlocking the bucket row after he is done reading/modifying the entry.
+ * If looked up for update, returns with the bucket row locked and provides
+ * the hash row pointer and a pointer to the previous element, with the
+ * following semantics:
+ * - on success, prevp points to the previous element, unless the element
+ *   found is the very first one.
+ * - on failure, prevp points to the next-to-last element, unless the
+ *   hash row has 1 element or 0 elements.
+ *
+ * If looked up for read, returns within an SMR section.
  */
 static struct hc_metrics *
-tcp_hc_lookup(struct in_conninfo *inc, bool update)
+tcp_hc_lookup(struct in_conninfo *inc, bool update, struct hc_head **headp,
+    struct hc_metrics **prevp)

markj: I would split this into two functions. There would be some duplication, but the checks in the main loop can be lifted into a function to reduce that, and it's nicer to have separate code paths for readers and writers when they use different synchronization primitives.

glebius: My original version was like this, but then I collapsed it back into one, as to me this looks nicer. Very subjective, of course. If more reviewers prefer to split it, I'll do it.

markj: Ok. I just generally dislike using flag variables to determine what kind of synchronization is being used. It is not just a matter of style; this kind of thing makes it harder to read, to write assertions, to get useful results from static analysis, etc. But this is not my code. :)

See the sketch after this function for one shape such a split could take.
 {
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;

 	KASSERT(inc != NULL, ("%s: NULL in_conninfo", __func__));

 	/*
 	 * Hash the foreign ip address.
 	 */
 	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);

 	hc_head = &V_tcp_hostcache.hashbase[hash];

-	/*
-	 * Acquire lock for this bucket row; we release the lock if we don't
-	 * find an entry, otherwise the caller has to unlock after he is
-	 * done.
-	 */
-	THC_LOCK(hc_head);
+	if (update) {
+		KASSERT(headp != NULL && prevp != NULL,
+		    ("%s: NULL headp or prevp", __func__));
+		*headp = hc_head;
+		*prevp = NULL;
+		THC_LOCK(hc_head);
+	} else
+		smr_enter(V_tcp_hostcache.smr);

 	/*
 	 * Iterate through entries in bucket row looking for a match.
 	 */
-	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
+	CK_SLIST_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {

markj: TAILQs are not SMR-safe. You need to use CK_LIST or CK_STAILQ for now. There is a diff to add SMR_LIST_* macros with extra parameters to allow assertions to be embedded, and to distinguish between readers and writers, but they did not land yet.

 		if (inc->inc_flags & INC_ISIPV6) {
 			/* XXX: check ip6_zoneid */
 			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
 			    sizeof(inc->inc6_faddr)) == 0)
 				goto found;
 		} else {
 			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
 			    sizeof(inc->inc_faddr)) == 0)
 				goto found;
 		}
+		if (update && CK_SLIST_NEXT(hc_entry, rmx_q) != NULL)
+			*prevp = hc_entry;
 	}

-	/*
-	 * We were unsuccessful and didn't find anything.
-	 */
-	THC_UNLOCK(hc_head);
 	return (NULL);

 found:
 #ifdef	TCP_HC_COUNTERS
 	if (update)
 		hc_entry->rmx_updates++;
 	else
 		hc_entry->rmx_hits++;
 #endif
-	hc_entry->rmx_expire = V_tcp_hostcache.expire;
+	if (hc_entry->rmx_expire != V_tcp_hostcache.expire)
+		atomic_store_int(&hc_entry->rmx_expire, V_tcp_hostcache.expire);

markj: Is there any point in having a branch here? atomic_store_int() is just a plain store.

glebius: My goal was to avoid unnecessary cache line trashing on every lookup. Imagine two CPUs looking up the same entry a million times a second (a SYN flood scenario); if each of them updates the entry on each lookup, that would create cache misses. With the check, that would happen only once per prune interval.

 	return (hc_entry);
 }
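Below is a sketch of the split markj suggests above. It is illustrative only: the names tcp_hc_match(), tcp_hc_lookup_smr(), and tcp_hc_lookup_locked() are free choices, while the synchronization calls are the same ones the patch already uses.

/* Shared match helper, lifted out of the main loop. */
static bool
tcp_hc_match(struct hc_metrics *e, struct in_conninfo *inc)
{
	if (inc->inc_flags & INC_ISIPV6)
		return (memcmp(&inc->inc6_faddr, &e->ip6,
		    sizeof(inc->inc6_faddr)) == 0);
	return (memcmp(&inc->inc_faddr, &e->ip4,
	    sizeof(inc->inc_faddr)) == 0);
}

/* Reader: success and failure both return within the SMR section. */
static struct hc_metrics *
tcp_hc_lookup_smr(struct in_conninfo *inc, struct hc_head *row)
{
	struct hc_metrics *e;

	smr_enter(V_tcp_hostcache.smr);
	CK_SLIST_FOREACH(e, &row->hch_bucket, rmx_q)
		if (tcp_hc_match(e, inc))
			return (e);
	return (NULL);
}

/* Writer: success and failure both return with the row locked. */
static struct hc_metrics *
tcp_hc_lookup_locked(struct in_conninfo *inc, struct hc_head *row,
    struct hc_metrics **prevp)
{
	struct hc_metrics *e;

	*prevp = NULL;
	THC_LOCK(row);
	CK_SLIST_FOREACH(e, &row->hch_bucket, rmx_q) {
		if (tcp_hc_match(e, inc))
			return (e);
		if (CK_SLIST_NEXT(e, rmx_q) != NULL)
			*prevp = e;	/* track next-to-last for eviction */
	}
	return (NULL);
}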
-/*
- * Internal function: insert an entry into the hostcache or return NULL if
- * unable to allocate a new one.
- *
- * If an entry has been returned, the caller becomes responsible for
- * unlocking the bucket row after he is done reading/modifying the entry.
- */
-static struct hc_metrics *
-tcp_hc_insert(struct in_conninfo *inc)
-{
-	int hash;
-	struct hc_head *hc_head;
-	struct hc_metrics *hc_entry;
-
-	KASSERT(inc != NULL, ("%s: NULL in_conninfo", __func__));
-
-	/*
-	 * Hash the foreign ip address.
-	 */
-	if (inc->inc_flags & INC_ISIPV6)
-		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
-	else
-		hash = HOSTCACHE_HASH(&inc->inc_faddr);
-
-	hc_head = &V_tcp_hostcache.hashbase[hash];
-
-	/*
-	 * Acquire lock for this bucket row; we release the lock if we don't
-	 * find an entry, otherwise the caller has to unlock after he is
-	 * done.
-	 */
-	THC_LOCK(hc_head);
-
-	/*
-	 * If the bucket limit is reached, reuse the least-used element.
-	 */
-	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
-	    atomic_load_int(&V_tcp_hostcache.cache_count) >=
-	    V_tcp_hostcache.cache_limit) {
-		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
-		/*
-		 * At first we were dropping the last element, just to
-		 * reacquire it in the next two lines again, which isn't very
-		 * efficient.  Instead just reuse the least used element.
-		 * We may drop something that is still "in-use" but we can be
-		 * "lossy".
-		 * Just give up if this bucket row is empty and we don't have
-		 * anything to replace.
-		 */
-		if (hc_entry == NULL) {
-			THC_UNLOCK(hc_head);
-			return (NULL);
-		}
-		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
-		KASSERT(V_tcp_hostcache.hashbase[hash].hch_length > 0 &&
-		    V_tcp_hostcache.hashbase[hash].hch_length <=
-		    V_tcp_hostcache.bucket_limit,
-		    ("tcp_hostcache: bucket length range violated at %u: %u",
-		    hash, V_tcp_hostcache.hashbase[hash].hch_length));
-		V_tcp_hostcache.hashbase[hash].hch_length--;
-		atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
-		TCPSTAT_INC(tcps_hc_bucketoverflow);
-#if 0
-		uma_zfree(V_tcp_hostcache.zone, hc_entry);
-#endif
-	} else {
-		/*
-		 * Allocate a new entry, or balk if not possible.
-		 */
-		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
-		if (hc_entry == NULL) {
-			THC_UNLOCK(hc_head);
-			return (NULL);
-		}
-	}
-
-	/*
-	 * Initialize basic information of hostcache entry.
-	 */
-	bzero(hc_entry, sizeof(*hc_entry));
-	if (inc->inc_flags & INC_ISIPV6) {
-		hc_entry->ip6 = inc->inc6_faddr;
-		hc_entry->ip6_zoneid = inc->inc6_zoneid;
-	} else
-		hc_entry->ip4 = inc->inc_faddr;
-	hc_entry->rmx_head = hc_head;
-	hc_entry->rmx_expire = V_tcp_hostcache.expire;
-
-	/*
-	 * Put it upfront.
-	 */
-	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
-	V_tcp_hostcache.hashbase[hash].hch_length++;
-	KASSERT(V_tcp_hostcache.hashbase[hash].hch_length <
-	    V_tcp_hostcache.bucket_limit,
-	    ("tcp_hostcache: bucket length too high at %u: %u",
-	    hash, V_tcp_hostcache.hashbase[hash].hch_length));
-	atomic_add_int(&V_tcp_hostcache.cache_count, 1);
-	TCPSTAT_INC(tcps_hc_added);
-
-	return (hc_entry);
-}
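The deletion above follows directly from the SMR rules in the file header: the old path recycled the least-used entry in place, but with lock-free readers that would expose the bzero()/re-initialization window. A hedged illustration of the hazard, in timeline form (not code from this change):

/*
 * Illustration only: why reuse-in-place is unsafe once readers are lock-free.
 *
 *   reader (SMR section)               writer (row lock held)
 *   --------------------               ----------------------
 *   smr_enter(...);
 *   e = CK_SLIST_FIRST(&row->...);
 *                                      <unlink e for reuse>
 *   mtu = e->rmx_mtu;  <-- still valid bzero(e, sizeof(*e));
 *   rtt = e->rmx_rtt;  <-- zero/trash  e->ip4 = new_address;
 *   smr_exit(...);
 *
 * With UMA_ZONE_SMR the writer instead calls uma_zfree_smr() and allocates a
 * fresh entry; UMA defers reuse of the freed memory until every reader that
 * could still hold the pointer has left its SMR section.
 */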
 /*
  * External function: look up an entry in the hostcache and fill out the
  * supplied TCP metrics structure.  Fills in NULL when no entry was found or
  * a value is not set.
  */
 void
 tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
 {
 	struct hc_metrics *hc_entry;

 	if (!V_tcp_use_hostcache) {
 		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
 		return;
 	}

 	/*
 	 * Find the right bucket.
 	 */
-	hc_entry = tcp_hc_lookup(inc, false);
+	hc_entry = tcp_hc_lookup(inc, false, NULL, NULL);

 	/*
 	 * If we don't have an existing object.
 	 */
 	if (hc_entry == NULL) {
+		smr_exit(V_tcp_hostcache.smr);
 		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
 		return;
 	}

 	hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
 	hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
 	hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
 	hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
 	hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
 	hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
 	hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;

-	/*
-	 * Unlock bucket row.
-	 */
-	THC_UNLOCK(hc_entry->rmx_head);
+	smr_exit(V_tcp_hostcache.smr);
 }
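A hedged example of how a consumer might use this KBI. The caller shown is hypothetical (the real consumers live in the TCP stack, e.g. the MSS/metrics code in tcp_input.c); only tcp_hc_get() and struct hc_metrics_lite are from this file:

/* Illustrative caller: prefer a cached congestion window, else a default. */
static uint32_t
example_initial_cwnd(struct in_conninfo *inc, uint32_t default_cwnd)
{
	struct hc_metrics_lite hcml;

	tcp_hc_get(inc, &hcml);		/* zeroed if nothing is cached */
	return (hcml.rmx_cwnd != 0 ? hcml.rmx_cwnd : default_cwnd);
}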
 /*
  * External function: look up an entry in the hostcache and return the
  * discovered path MTU.  Returns 0 if no entry is found or value is not
  * set.
  */
 uint32_t
 tcp_hc_getmtu(struct in_conninfo *inc)
 {
 	struct hc_metrics *hc_entry;
 	uint32_t mtu;

 	if (!V_tcp_use_hostcache)
 		return (0);

-	hc_entry = tcp_hc_lookup(inc, false);
+	hc_entry = tcp_hc_lookup(inc, false, NULL, NULL);
 	if (hc_entry == NULL) {
+		smr_exit(V_tcp_hostcache.smr);
 		return (0);
 	}

 	mtu = hc_entry->rmx_mtu;
-	THC_UNLOCK(hc_entry->rmx_head);
+	smr_exit(V_tcp_hostcache.smr);

 	return (mtu);
 }
 /*
  * External function: update the MTU value of an entry in the hostcache.
  * Creates a new entry if none was found.
  */
 void
 tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu)
 {
 	struct hc_metrics_lite hcml = { .rmx_mtu = mtu };

 	return (tcp_hc_update(inc, &hcml));
 }
 /*
  * External function: update the TCP metrics of an entry in the hostcache.
  * Creates a new entry if none was found.
  */
 void
 tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
 {
-	struct hc_metrics *hc_entry;
+	struct hc_head *hc_head;
+	struct hc_metrics *hc_entry, *hc_prev;
+	uint32_t v;
+	bool new;

 	if (!V_tcp_use_hostcache)
 		return;

-	hc_entry = tcp_hc_lookup(inc, true);
+	hc_entry = tcp_hc_lookup(inc, true, &hc_head, &hc_prev);
 	if (hc_entry == NULL) {
-		hc_entry = tcp_hc_insert(inc);
-		if (hc_entry == NULL)
-			return;
+		/*
+		 * Try to allocate a new entry.  If the bucket limit is
+		 * reached, delete the least-used element, located at the end
+		 * of the CK_SLIST.  Give up if the row is empty.
+		 *
+		 * tcp_hc_lookup() has provided us with a pointer to the
+		 * next-to-last element, in case the list has at least 2
+		 * elements.
+		 */

markj: "second to last" or "next to last" is a more standard way of referring to hc_prev, IMO.
+		if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
+		    atomic_load_int(&V_tcp_hostcache.cache_count) >=
+		    V_tcp_hostcache.cache_limit) {
+			if (hc_prev != NULL) {
+				hc_entry = CK_SLIST_NEXT(hc_prev, rmx_q);
+				KASSERT(CK_SLIST_NEXT(hc_entry, rmx_q) == NULL,
+				    ("%s: %p is not next-to-last",
+				    __func__, hc_prev));
+				CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q);
+			} else if ((hc_entry =
+			    CK_SLIST_FIRST(&hc_head->hch_bucket)) != NULL) {
+				KASSERT(CK_SLIST_NEXT(hc_entry, rmx_q) == NULL,
+				    ("%s: %p is not the only element",
+				    __func__, hc_entry));
+				CK_SLIST_REMOVE_HEAD(&hc_head->hch_bucket,
+				    rmx_q);
+			} else {
+				THC_UNLOCK(hc_head);
+				return;
+			}

markj: These checks should probably be in their own function.

glebius: The cycles are now different. For update we keep the previous pointer and for read lookup we don't. I can reduce copy-n-paste in the hash calculation, though.

+			KASSERT(hc_head->hch_length > 0 &&
+			    hc_head->hch_length <=
+			    V_tcp_hostcache.bucket_limit,
+			    ("tcp_hostcache: bucket length violated at %p",
+			    hc_head));
+			hc_head->hch_length--;
+			atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
+			TCPSTAT_INC(tcps_hc_bucketoverflow);
+			uma_zfree_smr(V_tcp_hostcache.zone, hc_entry);
+		}
+
+		/*
+		 * Allocate a new entry, or balk if not possible.
+		 */
+		hc_entry = uma_zalloc_smr(V_tcp_hostcache.zone, M_NOWAIT);
+		if (hc_entry == NULL) {
+			THC_UNLOCK(hc_head);
+			return;
+		}
+
+		/*
+		 * Initialize basic information of hostcache entry.
+		 */

rscheff: This check was added to make sure no undetected race (despite the row lock) exists and leads to accounting errors.

+		bzero(hc_entry, sizeof(*hc_entry));
+		if (inc->inc_flags & INC_ISIPV6) {
+			hc_entry->ip6 = inc->inc6_faddr;
+			hc_entry->ip6_zoneid = inc->inc6_zoneid;
+		} else
+			hc_entry->ip4 = inc->inc_faddr;
+		hc_entry->rmx_expire = V_tcp_hostcache.expire;
+		new = true;
+	} else
+		new = false;
+	/*
+	 * Fill in data.  Use atomics, since an existing entry is
+	 * accessible by readers in an SMR section.
+	 */

markj: This comment is a bit misleading to me: plain (aligned) stores are already assumed to be atomic. atomic_store here just ensures that the stores will not be reordered with other atomic(9) operations, which doesn't seem to be very important. However, it does make it clear that data races are possible and expected, so we should keep them. But then tcp_hc_get() and similar functions should also use atomic_load.

glebius: For the TCP hostcache it is non-fatal to read a stale value when a new value is being written. However, we don't want any trash value to be read; we need either the old or the new one. On amd64, aligned stores can't produce any intermediate values, and hc_entry is most likely aligned, although there is no enforcement for each of its fields. So, all these atomics aren't necessary in practice. I don't know if any other or any future arch can write a 32-bit word in two steps. I sought advice from Kostik and he suggested keeping these atomics as self-documenting code.

markj: The compiler will ensure that each field is naturally aligned, unless you specifically force it not to. I agree with what you wrote. I think you should keep the atomics. The change I'm suggesting is to add corresponding atomic_loads. Again, it shows that data races are possible, and thus provides hints both for the reader and for tools that try to automatically detect data races, like KCSAN. Right now we are not very good about this, but at least we can try to avoid it in new code.

See the sketch after this function for the matching atomic loads.
 	if (hcml->rmx_mtu != 0) {
-		hc_entry->rmx_mtu = hcml->rmx_mtu;
+		atomic_store_32(&hc_entry->rmx_mtu, hcml->rmx_mtu);
 	}
 	if (hcml->rmx_rtt != 0) {
 		if (hc_entry->rmx_rtt == 0)
-			hc_entry->rmx_rtt = hcml->rmx_rtt;
+			v = hcml->rmx_rtt;
 		else
-			hc_entry->rmx_rtt = ((uint64_t)hc_entry->rmx_rtt +
-			    (uint64_t)hcml->rmx_rtt) / 2;
+			v = ((uint64_t)hc_entry->rmx_rtt +
+			    (uint64_t)hcml->rmx_rtt) / 2;
+		atomic_store_32(&hc_entry->rmx_rtt, v);
 		TCPSTAT_INC(tcps_cachedrtt);
 	}
 	if (hcml->rmx_rttvar != 0) {
 		if (hc_entry->rmx_rttvar == 0)
-			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
+			v = hcml->rmx_rttvar;
 		else
-			hc_entry->rmx_rttvar = ((uint64_t)hc_entry->rmx_rttvar +
-			    (uint64_t)hcml->rmx_rttvar) / 2;
+			v = ((uint64_t)hc_entry->rmx_rttvar +
+			    (uint64_t)hcml->rmx_rttvar) / 2;
+		atomic_store_32(&hc_entry->rmx_rttvar, v);
 		TCPSTAT_INC(tcps_cachedrttvar);
 	}
 	if (hcml->rmx_ssthresh != 0) {
 		if (hc_entry->rmx_ssthresh == 0)
-			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
+			v = hcml->rmx_ssthresh;
 		else
-			hc_entry->rmx_ssthresh =
-			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
+			v = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
+		atomic_store_32(&hc_entry->rmx_ssthresh, v);
 		TCPSTAT_INC(tcps_cachedssthresh);
 	}
 	if (hcml->rmx_cwnd != 0) {
 		if (hc_entry->rmx_cwnd == 0)
-			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
+			v = hcml->rmx_cwnd;
 		else
-			hc_entry->rmx_cwnd = ((uint64_t)hc_entry->rmx_cwnd +
-			    (uint64_t)hcml->rmx_cwnd) / 2;
+			v = ((uint64_t)hc_entry->rmx_cwnd +
+			    (uint64_t)hcml->rmx_cwnd) / 2;
+		atomic_store_32(&hc_entry->rmx_cwnd, v);
 		/* TCPSTAT_INC(tcps_cachedcwnd); */
 	}
 	if (hcml->rmx_sendpipe != 0) {
 		if (hc_entry->rmx_sendpipe == 0)
-			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
+			v = hcml->rmx_sendpipe;
 		else
-			hc_entry->rmx_sendpipe =
-			    ((uint64_t)hc_entry->rmx_sendpipe +
-			    (uint64_t)hcml->rmx_sendpipe) / 2;
+			v = ((uint64_t)hc_entry->rmx_sendpipe +
+			    (uint64_t)hcml->rmx_sendpipe) / 2;
+		atomic_store_32(&hc_entry->rmx_sendpipe, v);
 		/* TCPSTAT_INC(tcps_cachedsendpipe); */
 	}
 	if (hcml->rmx_recvpipe != 0) {
 		if (hc_entry->rmx_recvpipe == 0)
-			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
+			v = hcml->rmx_recvpipe;
 		else
-			hc_entry->rmx_recvpipe =
-			    ((uint64_t)hc_entry->rmx_recvpipe +
-			    (uint64_t)hcml->rmx_recvpipe) / 2;
+			v = ((uint64_t)hc_entry->rmx_recvpipe +
+			    (uint64_t)hcml->rmx_recvpipe) / 2;
+		atomic_store_32(&hc_entry->rmx_recvpipe, v);
 		/* TCPSTAT_INC(tcps_cachedrecvpipe); */
 	}
-	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
-	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
-	THC_UNLOCK(hc_entry->rmx_head);
+	/*
+	 * Put it upfront.
+	 */
+	if (new) {
+		CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);

markj: This illustrates why we need special queue macros here. Since reads are now unlocked, something needs to ensure that all stores to the nascent hc_entry are visible to other CPUs before it is added to the queue.

glebius: The nature of the hostcache is that a lookup failure isn't fatal. So, I think, the change is safe with TAILQ. As long as SMR guarantees that we won't read entry contents from the previous allocation, we are good.

markj: It is not just lookup failures that need to be considered. There are many fields that get initialized before the structure is inserted into the hash bucket. Without synchronization it is possible to match on a half-constructed hostcache entry. This is not memory safe anyway. There is nothing guaranteeing that the update of the "next" pointer of a new entry will be visible before the update to the queue head, so a concurrent lookup might follow a garbage pointer. It may work fine in practice on amd64, but the standard queue.h macros do not give the right semantics on platforms with weak memory ordering guarantees.

glebius: Got it, understood. Thanks, Mark. Will update the revision.

+		hc_head->hch_length++;
+		KASSERT(hc_head->hch_length < V_tcp_hostcache.bucket_limit,
+		    ("tcp_hostcache: bucket length too high at %p", hc_head));
+		atomic_add_int(&V_tcp_hostcache.cache_count, 1);
+		TCPSTAT_INC(tcps_hc_added);
+	} else if (hc_entry != CK_SLIST_FIRST(&hc_head->hch_bucket)) {
+		KASSERT(CK_SLIST_NEXT(hc_prev, rmx_q) == hc_entry,
+		    ("%s: %p next is not %p", __func__, hc_prev, hc_entry));
+		CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q);
+		CK_SLIST_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
+	}
+	THC_UNLOCK(hc_head);
 }
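Tying together two of the review threads above (the atomic stores in this function and the branch-before-store in tcp_hc_lookup()), here is a hedged sketch of the matching read side markj asks for. The function names are hypothetical; atomic_load_32(), atomic_load_int(), and atomic_store_int() are the real atomic(9) primitives:

/* Sketch only: pair every racy writer store with an atomic reader load. */
static uint32_t
example_read_rtt(struct hc_metrics *e)
{
	/* Under SMR: may see the old or the new value, never a torn one. */
	return (atomic_load_32(&e->rmx_rtt));
}

/* Sketch only: the branch keeps hot entries from being dirtied per hit. */
static void
example_refresh_expire(struct hc_metrics *e)
{
	if (atomic_load_int(&e->rmx_expire) != V_tcp_hostcache.expire)
		atomic_store_int(&e->rmx_expire, V_tcp_hostcache.expire);
}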
 /*
  * Sysctl function: prints the list and values of all hostcache entries in
  * unsorted order.
  */
 static int
 sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
 {
[... 32 lines not shown ...]
 #ifdef	TCP_HC_COUNTERS
"HITS UPD " | "HITS UPD " | ||||||||
#endif | #endif | ||||||||
"EXP\n"); | "EXP\n"); | ||||||||
sbuf_drain(&sb); | sbuf_drain(&sb); | ||||||||
#define msec(u) (((u) + 500) / 1000) | #define msec(u) (((u) + 500) / 1000) | ||||||||
for (i = 0; i < V_tcp_hostcache.hashsize; i++) { | for (i = 0; i < V_tcp_hostcache.hashsize; i++) { | ||||||||
THC_LOCK(&V_tcp_hostcache.hashbase[i]); | THC_LOCK(&V_tcp_hostcache.hashbase[i]); | ||||||||
TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, | CK_SLIST_FOREACH(hc_entry, | ||||||||
rmx_q) { | &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { | ||||||||
sbuf_printf(&sb, | sbuf_printf(&sb, | ||||||||
"%-15s %5u %8u %6lums %6lums %8u %8u %8u " | "%-15s %5u %8u %6lums %6lums %8u %8u %8u " | ||||||||
#ifdef TCP_HC_COUNTERS | #ifdef TCP_HC_COUNTERS | ||||||||
"%4lu %4lu " | "%4lu %4lu " | ||||||||
#endif | #endif | ||||||||
"%4i\n", | "%4i\n", | ||||||||
hc_entry->ip4.s_addr ? | hc_entry->ip4.s_addr ? | ||||||||
inet_ntoa_r(hc_entry->ip4, ip4buf) : | inet_ntoa_r(hc_entry->ip4, ip4buf) : | ||||||||
[... 69 lines not shown ...]
 }
 /*
  * Caller has to make sure the curvnet is set properly.
  */
 static void
 tcp_hc_purge_internal(int all)
 {
-	struct hc_metrics *hc_entry, *hc_next;
+	struct hc_head *head;
+	struct hc_metrics *hc_entry, *hc_next, *hc_prev;
 	int i;

 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
-		THC_LOCK(&V_tcp_hostcache.hashbase[i]);
-		TAILQ_FOREACH_SAFE(hc_entry,
-		    &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
-			KASSERT(V_tcp_hostcache.hashbase[i].hch_length > 0 &&
-			    V_tcp_hostcache.hashbase[i].hch_length <=
-			    V_tcp_hostcache.bucket_limit, ("tcp_hostcache: "
-			    "bucket length out of range at %u: %u",
-			    i, V_tcp_hostcache.hashbase[i].hch_length));
+		head = &V_tcp_hostcache.hashbase[i];
+		hc_prev = NULL;
+		THC_LOCK(head);
+		CK_SLIST_FOREACH_SAFE(hc_entry, &head->hch_bucket, rmx_q,
+		    hc_next) {
+			KASSERT(head->hch_length > 0 && head->hch_length <=
+			    V_tcp_hostcache.bucket_limit, ("tcp_hostcache: "
+			    "bucket length out of range at %u: %u", i,
+			    head->hch_length));

markj: How is it guaranteed that hch_length > 0?

glebius: If we entered the list traversal, then there is at least one entry on the list. This logic was here before and my patch doesn't change it.

 			if (all || hc_entry->rmx_expire <= 0) {
-				TAILQ_REMOVE(
-				    &V_tcp_hostcache.hashbase[i].hch_bucket,
-				    hc_entry, rmx_q);
-				uma_zfree(V_tcp_hostcache.zone, hc_entry);
+				if (hc_prev != NULL) {
+					KASSERT(hc_entry ==
+					    CK_SLIST_NEXT(hc_prev, rmx_q),
+					    ("%s: %p is not next to %p",
+					    __func__, hc_entry, hc_prev));
+					CK_SLIST_REMOVE_AFTER(hc_prev, rmx_q);
+				} else {
+					KASSERT(hc_entry ==
+					    CK_SLIST_FIRST(&head->hch_bucket),
+					    ("%s: %p is not first",
+					    __func__, hc_entry));
+					CK_SLIST_REMOVE_HEAD(&head->hch_bucket,
+					    rmx_q);
+				}
+				uma_zfree_smr(V_tcp_hostcache.zone, hc_entry);
 				V_tcp_hostcache.hashbase[i].hch_length--;

markj: This can become head->hch_length--.

 				atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
-			} else
-				hc_entry->rmx_expire -= V_tcp_hostcache.prune;
+			} else {
+				atomic_subtract_int(&hc_entry->rmx_expire,
+				    V_tcp_hostcache.prune);
+				hc_prev = hc_entry;
+			}
 		}
-		THC_UNLOCK(&V_tcp_hostcache.hashbase[i]);
+		THC_UNLOCK(head);
 	}
 }
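The loop above leans on a CK_SLIST idiom that is easy to get wrong: the list is singly linked, so removal needs the previous element, and prev must advance only when the current node is kept. A generic, self-contained sketch of the same pattern (the node type, list head, and zone are illustrative, not from this change):

struct node {
	CK_SLIST_ENTRY(node)	link;
	int			expired;
};
CK_SLIST_HEAD(node_list, node);

/* Sketch only: writer-side purge of a row, with reuse deferred by SMR. */
static void
example_purge_row(struct node_list *head, uma_zone_t zone)
{
	struct node *n, *next, *prev = NULL;

	CK_SLIST_FOREACH_SAFE(n, head, link, next) {
		if (n->expired) {
			if (prev != NULL)
				CK_SLIST_REMOVE_AFTER(prev, link);
			else
				CK_SLIST_REMOVE_HEAD(head, link);
			uma_zfree_smr(zone, n);	/* reused after grace period */
		} else
			prev = n;	/* advance prev only when keeping n */
	}
}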
 /*
  * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
  * periodically from the callout.
  */
 static void
[... 41 lines not shown ...]