Changeset View
Standalone View
sys/kern/uipc_ktls.c
Show First 20 Lines • Show All 103 Lines • ▼ Show 20 Lines | |||||
}; | }; | ||||
struct ktls_domain_info ktls_domains[MAXMEMDOM]; | struct ktls_domain_info ktls_domains[MAXMEMDOM]; | ||||
static struct ktls_wq *ktls_wq; | static struct ktls_wq *ktls_wq; | ||||
static struct proc *ktls_proc; | static struct proc *ktls_proc; | ||||
static uma_zone_t ktls_session_zone; | static uma_zone_t ktls_session_zone; | ||||
static uma_zone_t ktls_buffer_zone; | static uma_zone_t ktls_buffer_zone; | ||||
static uint16_t ktls_cpuid_lookup[MAXCPU]; | static uint16_t ktls_cpuid_lookup[MAXCPU]; | ||||
static int ktls_init_state; | |||||
static struct sx ktls_init_lock; | |||||
SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init"); | |||||
SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
"Kernel TLS offload"); | "Kernel TLS offload"); | ||||
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
"Kernel TLS offload stats"); | "Kernel TLS offload stats"); | ||||
#ifdef RSS | #ifdef RSS | ||||
static int ktls_bind_threads = 1; | static int ktls_bind_threads = 1; | ||||
▲ Show 20 Lines • Show All 254 Lines • ▼ Show 20 Lines | |||||
/*
 * M_EXTPG free routine for mbufs whose pages come from a single
 * physically contiguous buffer allocated from ktls_buffer_zone.
 * The buffer is returned to the zone via the direct map address of
 * its first (and only) physical page.
 */
static void
ktls_free_mext_contig(struct mbuf *m)
{
	M_ASSERTEXTPG(m);
	uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
}
static void | static int | ||||
ktls_init(void *dummy __unused) | ktls_init(void) | ||||
{ | { | ||||
struct thread *td; | struct thread *td; | ||||
struct pcpu *pc; | struct pcpu *pc; | ||||
cpuset_t mask; | |||||
int count, domain, error, i; | int count, domain, error, i; | ||||
ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, | ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, | ||||
M_WAITOK | M_ZERO); | M_WAITOK | M_ZERO); | ||||
ktls_session_zone = uma_zcreate("ktls_session", | ktls_session_zone = uma_zcreate("ktls_session", | ||||
sizeof(struct ktls_session), | sizeof(struct ktls_session), | ||||
NULL, NULL, NULL, NULL, | NULL, NULL, NULL, NULL, | ||||
Show All 9 Lines | ktls_init(void) | ||||
/* | /* | ||||
* Initialize the workqueues to run the TLS work. We create a | * Initialize the workqueues to run the TLS work. We create a | ||||
* work queue for each CPU. | * work queue for each CPU. | ||||
*/ | */ | ||||
CPU_FOREACH(i) { | CPU_FOREACH(i) { | ||||
STAILQ_INIT(&ktls_wq[i].m_head); | STAILQ_INIT(&ktls_wq[i].m_head); | ||||
STAILQ_INIT(&ktls_wq[i].so_head); | STAILQ_INIT(&ktls_wq[i].so_head); | ||||
mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); | mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); | ||||
error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], | |||||
&ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); | |||||
if (error) | |||||
panic("Can't add KTLS thread %d error %d", i, error); | |||||
/* | |||||
* Bind threads to cores. If ktls_bind_threads is > | |||||
* 1, then we bind to the NUMA domain. | |||||
*/ | |||||
if (ktls_bind_threads) { | |||||
if (ktls_bind_threads > 1) { | if (ktls_bind_threads > 1) { | ||||
pc = pcpu_find(i); | pc = pcpu_find(i); | ||||
domain = pc->pc_domain; | domain = pc->pc_domain; | ||||
CPU_COPY(&cpuset_domain[domain], &mask); | |||||
count = ktls_domains[domain].count; | count = ktls_domains[domain].count; | ||||
ktls_domains[domain].cpu[count] = i; | ktls_domains[domain].cpu[count] = i; | ||||
ktls_domains[domain].count++; | ktls_domains[domain].count++; | ||||
} else { | |||||
CPU_SETOF(i, &mask); | |||||
} | } | ||||
error = cpuset_setthread(td->td_tid, &mask); | |||||
if (error) | |||||
panic( | |||||
"Unable to bind KTLS thread for CPU %d error %d", | |||||
i, error); | |||||
} | |||||
ktls_cpuid_lookup[ktls_number_threads] = i; | ktls_cpuid_lookup[ktls_number_threads] = i; | ||||
ktls_number_threads++; | ktls_number_threads++; | ||||
} | } | ||||
/* | /* | ||||
* If we somehow have an empty domain, fall back to choosing | |||||
* among all KTLS threads. | |||||
markj: I know you didn't add this, but this comment doesn't make sense to me. If there is an empty domain… | |||||
Done Inline ActionsI think the comment is about the code in ktls_get_cpu() where we use the value of this variable to decide if we choose a KTLS thread in a connection's NUMA domain vs choosing any KTLS thread. It is also true that by moving this check earlier before starting worker threads we are now consistent in how we bind kthreads in the case we failed back to 1. jhb: I think the comment is about the code in ktls_get_cpu() where we use the value of this variable… | |||||
*/ | |||||
if (ktls_bind_threads > 1) { | |||||
for (i = 0; i < vm_ndomains; i++) { | |||||
if (ktls_domains[i].count == 0) { | |||||
ktls_bind_threads = 1; | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
/* Start kthreads for each workqueue. */ | |||||
CPU_FOREACH(i) { | |||||
error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], | |||||
&ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); | |||||
if (error) { | |||||
printf("Can't add KTLS thread %d error %d\n", i, error); | |||||
return (error); | |||||
} | |||||
} | |||||
/* | |||||
* Start an allocation thread per-domain to perform blocking allocations | * Start an allocation thread per-domain to perform blocking allocations | ||||
* of 16k physically contiguous TLS crypto destination buffers. | * of 16k physically contiguous TLS crypto destination buffers. | ||||
*/ | */ | ||||
if (ktls_sw_buffer_cache) { | if (ktls_sw_buffer_cache) { | ||||
for (domain = 0; domain < vm_ndomains; domain++) { | for (domain = 0; domain < vm_ndomains; domain++) { | ||||
if (VM_DOMAIN_EMPTY(domain)) | if (VM_DOMAIN_EMPTY(domain)) | ||||
continue; | continue; | ||||
if (CPU_EMPTY(&cpuset_domain[domain])) | if (CPU_EMPTY(&cpuset_domain[domain])) | ||||
continue; | continue; | ||||
error = kproc_kthread_add(ktls_alloc_thread, | error = kproc_kthread_add(ktls_alloc_thread, | ||||
&ktls_domains[domain], &ktls_proc, | &ktls_domains[domain], &ktls_proc, | ||||
&ktls_domains[domain].alloc_td.td, | &ktls_domains[domain].alloc_td.td, | ||||
0, 0, "KTLS", "alloc_%d", domain); | 0, 0, "KTLS", "alloc_%d", domain); | ||||
if (error) | if (error) { | ||||
panic("Can't add KTLS alloc thread %d error %d", | printf("Can't add KTLS alloc thread %d error %d\n", | ||||
domain, error); | domain, error); | ||||
CPU_COPY(&cpuset_domain[domain], &mask); | return (error); | ||||
error = cpuset_setthread(ktls_domains[domain].alloc_td.td->td_tid, | |||||
&mask); | |||||
if (error) | |||||
panic("Unable to bind KTLS alloc %d error %d", | |||||
domain, error); | |||||
} | } | ||||
} | } | ||||
/* | |||||
* If we somehow have an empty domain, fall back to choosing | |||||
* among all KTLS threads. | |||||
*/ | |||||
if (ktls_bind_threads > 1) { | |||||
for (i = 0; i < vm_ndomains; i++) { | |||||
if (ktls_domains[i].count == 0) { | |||||
ktls_bind_threads = 1; | |||||
break; | |||||
} | } | ||||
} | |||||
} | |||||
if (bootverbose) | if (bootverbose) | ||||
printf("KTLS: Initialized %d threads\n", ktls_number_threads); | printf("KTLS: Initialized %d threads\n", ktls_number_threads); | ||||
return (0); | |||||
} | } | ||||
SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); | |||||
/*
 * Lazily initialize KTLS on first use rather than at boot.
 *
 * ktls_init_state encodes the initialization status: 0 means not yet
 * attempted, > 0 means ktls_init() succeeded, < 0 means it failed
 * (subsequent callers get ENXIO without retrying).  The common case
 * (already initialized) is a single lock-free acquire load; the slow
 * path serializes on ktls_init_lock so that exactly one thread runs
 * ktls_init().
 *
 * Returns 0 on success, ENXIO if a prior initialization attempt
 * failed, or the error from ktls_init().
 */
static int
ktls_start_kthreads(void)
{
	int error, state;

start:
	/* Fast path: acquire load pairs with the release store below. */
	state = atomic_load_acq_int(&ktls_init_state);
	if (__predict_true(state > 0))
		return (0);
	if (state < 0)
		return (ENXIO);
	sx_xlock(&ktls_init_lock);
	if (ktls_init_state != 0) {
		/*
		 * Another thread finished (or failed) initialization
		 * while we slept on the lock; re-evaluate the state.
		 */
		sx_xunlock(&ktls_init_lock);
		goto start;
	}
	error = ktls_init();
	if (error == 0)
		state = 1;
	else
		state = -1;
	/*
	 * Publish the result with release semantics so the fast-path
	 * acquire load observes a fully initialized KTLS.
	 */
	atomic_store_rel_int(&ktls_init_state, state);
	sx_xunlock(&ktls_init_lock);
	return (error);
}
#if defined(INET) || defined(INET6) | #if defined(INET) || defined(INET6) | ||||
static int | static int | ||||
ktls_create_session(struct socket *so, struct tls_enable *en, | ktls_create_session(struct socket *so, struct tls_enable *en, | ||||
struct ktls_session **tlsp) | struct ktls_session **tlsp) | ||||
{ | { | ||||
struct ktls_session *tls; | struct ktls_session *tls; | ||||
int error; | int error; | ||||
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines | if (en->tls_vminor != TLS_MINOR_VER_TWO && | ||||
return (EINVAL); | return (EINVAL); | ||||
if (en->iv_len != TLS_CHACHA20_IV_LEN) | if (en->iv_len != TLS_CHACHA20_IV_LEN) | ||||
return (EINVAL); | return (EINVAL); | ||||
break; | break; | ||||
default: | default: | ||||
return (EINVAL); | return (EINVAL); | ||||
} | } | ||||
error = ktls_start_kthreads(); | |||||
if (error != 0) | |||||
return (error); | |||||
tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); | tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); | ||||
counter_u64_add(ktls_offload_active, 1); | counter_u64_add(ktls_offload_active, 1); | ||||
refcount_init(&tls->refcount, 1); | refcount_init(&tls->refcount, 1); | ||||
TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); | TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); | ||||
tls->wq_index = ktls_get_cpu(so); | tls->wq_index = ktls_get_cpu(so); | ||||
▲ Show 20 Lines • Show All 1,858 Lines • ▼ Show 20 Lines | if (error != 0) { | ||||
mb_free_notready(m, total_pages - npages); | mb_free_notready(m, total_pages - npages); | ||||
} | } | ||||
SOCK_LOCK(so); | SOCK_LOCK(so); | ||||
sorele(so); | sorele(so); | ||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
} | } | ||||
static int | |||||
ktls_bind_domain(int domain) | |||||
{ | |||||
int error; | |||||
error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]); | |||||
if (error != 0) | |||||
return (error); | |||||
curthread->td_domain.dr_policy = DOMAINSET_PREF(domain); | |||||
return (0); | |||||
} | |||||
static void | static void | ||||
ktls_alloc_thread(void *ctx) | ktls_alloc_thread(void *ctx) | ||||
{ | { | ||||
struct ktls_domain_info *ktls_domain = ctx; | struct ktls_domain_info *ktls_domain = ctx; | ||||
struct ktls_alloc_thread *sc = &ktls_domain->alloc_td; | struct ktls_alloc_thread *sc = &ktls_domain->alloc_td; | ||||
void **buf; | void **buf; | ||||
struct sysctl_oid *oid; | struct sysctl_oid *oid; | ||||
char name[80]; | char name[80]; | ||||
int i, nbufs; | int domain, error, i, nbufs; | ||||
curthread->td_domain.dr_policy = | domain = ktls_domain - ktls_domains; | ||||
DOMAINSET_PREF(PCPU_GET(domain)); | |||||
snprintf(name, sizeof(name), "domain%d", PCPU_GET(domain)); | |||||
if (bootverbose) | if (bootverbose) | ||||
printf("Starting KTLS alloc thread for domain %d\n", | printf("Starting KTLS alloc thread for domain %d\n", domain); | ||||
PCPU_GET(domain)); | error = ktls_bind_domain(domain); | ||||
if (error) | |||||
printf("Unable to bind KTLS alloc thread for domain %d: error %d\n", | |||||
domain, error); | |||||
snprintf(name, sizeof(name), "domain%d", domain); | |||||
oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO, | oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO, | ||||
name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); | name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); | ||||
SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", | SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", | ||||
CTLFLAG_RD, &sc->allocs, 0, "buffers allocated"); | CTLFLAG_RD, &sc->allocs, 0, "buffers allocated"); | ||||
SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups", | SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups", | ||||
CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups"); | CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups"); | ||||
SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running", | SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running", | ||||
CTLFLAG_RD, &sc->running, 0, "thread running"); | CTLFLAG_RD, &sc->running, 0, "thread running"); | ||||
Show All 38 Lines | |||||
static void | static void | ||||
ktls_work_thread(void *ctx) | ktls_work_thread(void *ctx) | ||||
{ | { | ||||
struct ktls_wq *wq = ctx; | struct ktls_wq *wq = ctx; | ||||
struct mbuf *m, *n; | struct mbuf *m, *n; | ||||
struct socket *so, *son; | struct socket *so, *son; | ||||
STAILQ_HEAD(, mbuf) local_m_head; | STAILQ_HEAD(, mbuf) local_m_head; | ||||
STAILQ_HEAD(, socket) local_so_head; | STAILQ_HEAD(, socket) local_so_head; | ||||
int cpu; | |||||
cpu = wq - ktls_wq; | |||||
if (bootverbose) | |||||
printf("Starting KTLS worker thread for CPU %d\n", cpu); | |||||
/* | |||||
* Bind to a core. If ktls_bind_threads is > 1, then | |||||
* we bind to the NUMA domain instead. | |||||
*/ | |||||
if (ktls_bind_threads) { | |||||
int error; | |||||
if (ktls_bind_threads > 1) { | if (ktls_bind_threads > 1) { | ||||
curthread->td_domain.dr_policy = | struct pcpu *pc = pcpu_find(cpu); | ||||
DOMAINSET_PREF(PCPU_GET(domain)); | |||||
error = ktls_bind_domain(pc->pc_domain); | |||||
} else { | |||||
cpuset_t mask; | |||||
CPU_SETOF(cpu, &mask); | |||||
Done Inline ActionsI wonder if this should just use sched_bind() instead? It doesn't fail. I guess 'cpuset -g -t <tid>' wouldn't show the binding, but that's the only difference I think? jhb: I wonder if this should just use sched_bind() instead? It doesn't fail. I guess 'cpuset -g -t… | |||||
Not Done Inline Actionssched_bind() does have the advantage that it binds immediately. cpuset_setthread() does not, it just (eventually) sets TDF_NEEDSCHED, so we'd have to wait for the next context switch. But I don't see why sched_bind() is really preferable, given that we use cpuset_setthread() for binding to domains as well. markj: sched_bind() does have the advantage that it binds immediately. cpuset_setthread() does not, it… | |||||
error = cpuset_setthread(curthread->td_tid, &mask); | |||||
} | |||||
if (error) | |||||
printf("Unable to bind KTLS worker thread for CPU %d: error %d\n", | |||||
cpu, error); | |||||
} | } | ||||
#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) | #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) | ||||
fpu_kern_thread(0); | fpu_kern_thread(0); | ||||
#endif | #endif | ||||
for (;;) { | for (;;) { | ||||
mtx_lock(&wq->mtx); | mtx_lock(&wq->mtx); | ||||
while (STAILQ_EMPTY(&wq->m_head) && | while (STAILQ_EMPTY(&wq->m_head) && | ||||
STAILQ_EMPTY(&wq->so_head)) { | STAILQ_EMPTY(&wq->so_head)) { | ||||
▲ Show 20 Lines • Show All 125 Lines • Show Last 20 Lines |
I know you didn't add this, but this comment doesn't make sense to me. If there is an empty domain (meaning a memory domain with no CPUs), we fall back to pinning each thread to a CPU.