diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index 881825bf1d9f..6815667594a4 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1,3433 +1,3433 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #include #include #ifdef RSS #include #include #endif #include #include #include #include #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf) m_head; STAILQ_HEAD(, socket) so_head; bool running; int lastallocfail; } __aligned(CACHE_LINE_SIZE); struct ktls_reclaim_thread { uint64_t wakeups; uint64_t reclaims; struct thread *td; int running; }; struct ktls_domain_info { int count; int cpu[MAXCPU]; struct ktls_reclaim_thread reclaim_td; }; struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; static uma_zone_t ktls_session_zone; static uma_zone_t ktls_buffer_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; static int ktls_init_state; static struct sx ktls_init_lock; SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init"); SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload stats"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores (1) or cores and domains (2) at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); unsigned int ktls_ifnet_max_rexmit_pct = 2; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN, &ktls_ifnet_max_rexmit_pct, 2, "Max percent bytes retransmitted before ifnet TLS is disabled"); static bool ktls_offload_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN, &ktls_cbc_enable, 1, "Enable support of AES-CBC crypto for kernel TLS"); static bool ktls_sw_buffer_cache = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN, &ktls_sw_buffer_cache, 1, "Enable caching of output buffers for SW encryption"); static int ktls_max_reclaim = 1024; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_reclaim, CTLFLAG_RWTUN, &ktls_max_reclaim, 128, "Max number of 16k buffers to reclaim in thread context"); static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active); SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD, &ktls_cnt_tx_pending, "Number of TLS 1.0 records waiting for earlier TLS records"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, &ktls_cnt_tx_queued, "Number of TLS records in queue to tasks for SW encryption"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, &ktls_cnt_rx_queued, "Number of TLS sockets in queue to tasks for SW decryption"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_total); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_active); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, &ktls_offload_corrupted_records, "Total corrupted TLS records received"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD, &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD, &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_destroy_task); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, destroy_task, CTLFLAG_RD, &ktls_destroy_task, "Number of times ktls session was destroyed via taskqueue"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Hardware (ifnet) TLS session stats"); #ifdef TCP_OFFLOAD SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TOE TLS session stats"); #endif static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_sw_chacha20, "Active number of software TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_ifnet_chacha20, "Active number of ifnet TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted = 1; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); #ifdef TCP_OFFLOAD static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD, &ktls_toe_cbc, "Active number of TOE TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD, &ktls_toe_gcm, "Active number of TOE TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_toe_chacha20, "Active number of TOE TLS sessions using Chacha20-Poly1305"); #endif static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_reclaim_thread(void *ctx); static void ktls_reset_receive_tag(void *context, int pending); static void ktls_reset_send_tag(void *context, int pending); static void ktls_work_thread(void *ctx); int ktls_copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) { struct tls_enable_v0 tls_v0; int error; uint8_t *cipher_key = NULL, *iv = NULL, *auth_key = NULL; if (sopt->sopt_valsize == sizeof(tls_v0)) { error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), sizeof(tls_v0)); if (error != 0) goto done; memset(tls, 0, sizeof(*tls)); tls->cipher_key = tls_v0.cipher_key; tls->iv = tls_v0.iv; tls->auth_key = tls_v0.auth_key; tls->cipher_algorithm = tls_v0.cipher_algorithm; tls->cipher_key_len = tls_v0.cipher_key_len; tls->iv_len = tls_v0.iv_len; tls->auth_algorithm = tls_v0.auth_algorithm; tls->auth_key_len = tls_v0.auth_key_len; tls->flags = tls_v0.flags; tls->tls_vmajor = tls_v0.tls_vmajor; tls->tls_vminor = tls_v0.tls_vminor; } else error = sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls)); if (error != 0) return (error); if (tls->cipher_key_len < 0 || tls->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (tls->iv_len < 0 || tls->iv_len > sizeof(((struct ktls_session *)NULL)->params.iv)) return (EINVAL); if (tls->auth_key_len < 0 || tls->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); /* All supported algorithms require a cipher key. */ if (tls->cipher_key_len == 0) return (EINVAL); /* * Now do a deep copy of the variable-length arrays in the struct, so that * subsequent consumers of it can reliably assume kernel memory. This * requires doing our own allocations, which we will free in the * error paths so that our caller need only worry about outstanding * allocations existing on successful return. */ if (tls->cipher_key_len != 0) { cipher_key = malloc(tls->cipher_key_len, M_KTLS, M_WAITOK); if (sopt->sopt_td != NULL) { error = copyin(tls->cipher_key, cipher_key, tls->cipher_key_len); if (error != 0) goto done; } else { bcopy(tls->cipher_key, cipher_key, tls->cipher_key_len); } } if (tls->iv_len != 0) { iv = malloc(tls->iv_len, M_KTLS, M_WAITOK); if (sopt->sopt_td != NULL) { error = copyin(tls->iv, iv, tls->iv_len); if (error != 0) goto done; } else { bcopy(tls->iv, iv, tls->iv_len); } } if (tls->auth_key_len != 0) { auth_key = malloc(tls->auth_key_len, M_KTLS, M_WAITOK); if (sopt->sopt_td != NULL) { error = copyin(tls->auth_key, auth_key, tls->auth_key_len); if (error != 0) goto done; } else { bcopy(tls->auth_key, auth_key, tls->auth_key_len); } } tls->cipher_key = cipher_key; tls->iv = iv; tls->auth_key = auth_key; done: if (error != 0) { zfree(cipher_key, M_KTLS); zfree(iv, M_KTLS); zfree(auth_key, M_KTLS); } return (error); } void ktls_cleanup_tls_enable(struct tls_enable *tls) { zfree(__DECONST(void *, tls->cipher_key), M_KTLS); zfree(__DECONST(void *, tls->iv), M_KTLS); zfree(__DECONST(void *, tls->auth_key), M_KTLS); } static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; #ifdef NUMA struct ktls_domain_info *di; #endif u_int cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that TLS 1.0 sessions rely on the * serialization provided by having the same connection use * the same queue. */ #ifdef NUMA if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { di = &ktls_domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } else #endif cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } static int ktls_buffer_import(void *arg, void **store, int count, int domain, int flags) { vm_page_t m; int i, req; KASSERT((ktls_maxlen & PAGE_MASK) == 0, ("%s: ktls max length %d is not page size-aligned", __func__, ktls_maxlen)); req = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags); for (i = 0; i < count; i++) { m = vm_page_alloc_noobj_contig_domain(domain, req, atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (m == NULL) break; store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (i); } static void ktls_buffer_release(void *arg __unused, void **store, int count) { vm_page_t m; int i, j; for (i = 0; i < count; i++) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (j = 0; j < atop(ktls_maxlen); j++) { (void)vm_page_unwire_noq(m + j); vm_page_free(m + j); } } } static void ktls_free_mext_contig(struct mbuf *m) { M_ASSERTEXTPG(m); uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0])); } static int ktls_init(void) { struct thread *td; struct pcpu *pc; int count, domain, error, i; ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); if (ktls_sw_buffer_cache) { ktls_buffer_zone = uma_zcache_create("ktls_buffers", roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL, ktls_buffer_import, ktls_buffer_release, NULL, - UMA_ZONE_FIRSTTOUCH); + UMA_ZONE_FIRSTTOUCH | UMA_ZONE_NOTRIM); } /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].m_head); STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); if (ktls_bind_threads > 1) { pc = pcpu_find(i); domain = pc->pc_domain; count = ktls_domains[domain].count; ktls_domains[domain].cpu[count] = i; ktls_domains[domain].count++; } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } /* * If we somehow have an empty domain, fall back to choosing * among all KTLS threads. */ if (ktls_bind_threads > 1) { for (i = 0; i < vm_ndomains; i++) { if (ktls_domains[i].count == 0) { ktls_bind_threads = 1; break; } } } /* Start kthreads for each workqueue. */ CPU_FOREACH(i) { error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) { printf("Can't add KTLS thread %d error %d\n", i, error); return (error); } } /* * Start an allocation thread per-domain to perform blocking allocations * of 16k physically contiguous TLS crypto destination buffers. */ if (ktls_sw_buffer_cache) { for (domain = 0; domain < vm_ndomains; domain++) { if (VM_DOMAIN_EMPTY(domain)) continue; if (CPU_EMPTY(&cpuset_domain[domain])) continue; error = kproc_kthread_add(ktls_reclaim_thread, &ktls_domains[domain], &ktls_proc, &ktls_domains[domain].reclaim_td.td, 0, 0, "KTLS", "reclaim_%d", domain); if (error) { printf("Can't add KTLS reclaim thread %d error %d\n", domain, error); return (error); } } } if (bootverbose) printf("KTLS: Initialized %d threads\n", ktls_number_threads); return (0); } static int ktls_start_kthreads(void) { int error, state; start: state = atomic_load_acq_int(&ktls_init_state); if (__predict_true(state > 0)) return (0); if (state < 0) return (ENXIO); sx_xlock(&ktls_init_lock); if (ktls_init_state != 0) { sx_xunlock(&ktls_init_lock); goto start; } error = ktls_init(); if (error == 0) state = 1; else state = -1; atomic_store_rel_int(&ktls_init_state, state); sx_xunlock(&ktls_init_lock); return (error); } static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp, int direction) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.3 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: #ifdef COMPAT_FREEBSD12 /* XXX: Really 13.0-current COMPAT. */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: #endif break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); switch (en->tls_vminor) { case TLS_MINOR_VER_TWO: if (en->iv_len != TLS_AEAD_GCM_LEN) return (EINVAL); break; case TLS_MINOR_VER_THREE: if (en->iv_len != TLS_1_3_GCM_IV_LEN) return (EINVAL); break; default: return (EINVAL); } break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: break; case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: if (en->tls_vminor != TLS_MINOR_VER_TWO) return (EINVAL); break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); /* * TLS 1.0 requires an implicit IV. TLS 1.1 and 1.2 * use explicit IVs. */ switch (en->tls_vminor) { case TLS_MINOR_VER_ZERO: if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; case TLS_MINOR_VER_ONE: case TLS_MINOR_VER_TWO: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } break; case CRYPTO_CHACHA20_POLY1305: if (en->auth_algorithm != 0 || en->auth_key_len != 0) return (EINVAL); if (en->tls_vminor != TLS_MINOR_VER_TWO && en->tls_vminor != TLS_MINOR_VER_THREE) return (EINVAL); if (en->iv_len != TLS_CHACHA20_IV_LEN) return (EINVAL); break; default: return (EINVAL); } error = ktls_start_kthreads(); if (error != 0) return (error); tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); if (direction == KTLS_RX) { TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls); } else { TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->inp = so->so_pcb; in_pcbref(tls->inp); tls->tx = true; } tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte * nonce. TLS 1.3 uses a 12 byte implicit IV. */ if (en->tls_vminor < TLS_MINOR_VER_THREE) tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. */ tls->sequential_records = true; tls->next_seqno = be64dec(en->rec_seq); STAILQ_INIT(&tls->pending_records); } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; case CRYPTO_CHACHA20_POLY1305: /* * Chacha20 uses a 12 byte implicit IV. */ tls->params.tls_tlen = POLY1305_HASH_LEN; tls->params.tls_bs = 1; break; default: panic("invalid cipher"); } /* * TLS 1.3 includes optional padding which we do not support, * and also puts the "real" record type at the end of the * encrypted data. */ if (en->tls_vminor == TLS_MINOR_VER_THREE) tls->params.tls_tlen += sizeof(uint8_t); KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); bcopy(en->auth_key, tls->params.auth_key, en->auth_key_len); } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); bcopy(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); /* * This holds the implicit portion of the nonce for AEAD * ciphers and the initial implicit IV for TLS 1.0. The * explicit portions of the IV are generated in ktls_frame(). */ if (en->iv_len != 0) { tls->params.iv_len = en->iv_len; bcopy(en->iv, tls->params.iv, en->iv_len); /* * For TLS 1.2 with GCM, generate an 8-byte nonce as a * counter to generate unique explicit IVs. * * Store this counter in the last 8 bytes of the IV * array so that it is 8-byte aligned. */ if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && en->tls_vminor == TLS_MINOR_VER_TWO) arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0); } *tlsp = tls; return (0); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls, int direction) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); if (direction == KTLS_RX) { TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_receive_tag, tls_new); } else { TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag, tls_new); tls_new->inp = tls->inp; tls_new->tx = true; in_pcbref(tls_new->inp); } /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. */ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #ifdef TCP_OFFLOAD static int ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) { struct inpcb *inp; struct tcpcb *tp; int error; inp = so->so_pcb; INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); if (!(tp->t_flags & TF_TOE)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = tcp_offload_alloc_tls_session(tp, tls, direction); INP_WUNLOCK(inp); if (error == 0) { tls->mode = TCP_TLS_MODE_TOE; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, 1); break; } } return (error); } #endif /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); /* * Allocate a TLS + ratelimit tag if the connection has an * existing pacing rate. */ if (tp->t_pacing_rate != -1 && (if_getcapenable(ifp) & IFCAP_TXTLS_RTLMT) != 0) { params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT; params.tls_rate_limit.inp = inp; params.tls_rate_limit.tls = tls; params.tls_rate_limit.max_rate = tp->t_pacing_rate; } else { params.hdr.type = IF_SND_TAG_TYPE_TLS; params.tls.inp = inp; params.tls.tls = tls; } params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; INP_RUNLOCK(inp); if ((if_getcapenable(ifp) & IFCAP_MEXTPG) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((if_getcapenable(ifp) & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((if_getcapenable(ifp) & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = m_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } /* * Allocate an initial TLS receive tag for doing HW decryption of TLS * data. * * This function allocates a new TLS receive tag on whatever interface * the connection is currently routed over. If the connection ends up * using a different interface for receive this will get fixed up via * ktls_input_ifp_mismatch as future packets arrive. */ static int ktls_alloc_rcv_tag(struct inpcb *inp, struct ktls_session *tls, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; int error; if (!ktls_ocf_recrypt_supported(tls)) return (ENXIO); INP_RLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. */ if (ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: As with ktls_alloc_snd_tag, use the cached route in * the inpcb to find the interface. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); tls->rx_ifp = ifp; params.hdr.type = IF_SND_TAG_TYPE_TLS_RX; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; params.tls_rx.inp = inp; params.tls_rx.tls = tls; params.tls_rx.vlan_id = 0; INP_RUNLOCK(inp); if (inp->inp_vflag & INP_IPV6) { if ((if_getcapenable2(ifp) & IFCAP2_BIT(IFCAP2_RXTLS6)) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((if_getcapenable2(ifp) & IFCAP2_BIT(IFCAP2_RXTLS4)) == 0) { error = EOPNOTSUPP; goto out; } } error = m_snd_tag_alloc(ifp, ¶ms, mstp); /* * If this connection is over a vlan, vlan_snd_tag_alloc * rewrites vlan_id with the saved interface. Save the VLAN * ID for use in ktls_reset_receive_tag which allocates new * receive tags directly from the leaf interface bypassing * if_vlan. */ if (error == 0) tls->rx_vlan_id = params.tls_rx.vlan_id; out: return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, int direction, bool force) { struct m_snd_tag *mst; int error; switch (direction) { case KTLS_TX: error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (__predict_false(error != 0)) goto done; break; case KTLS_RX: KASSERT(!force, ("%s: forced receive tag", __func__)); error = ktls_alloc_rcv_tag(so->so_pcb, tls, &mst); if (__predict_false(error != 0)) goto done; break; default: __assert_unreachable(); } tls->mode = TCP_TLS_MODE_IFNET; tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, 1); break; default: break; } done: return (error); } static void ktls_use_sw(struct ktls_session *tls) { tls->mode = TCP_TLS_MODE_SW; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, 1); break; } } static int ktls_try_sw(struct ktls_session *tls, int direction) { int error; error = ktls_ocf_try(tls, direction); if (error) return (error); ktls_use_sw(tls); return (0); } /* * KTLS RX stores data in the socket buffer as a list of TLS records, * where each record is stored as a control message containg the TLS * header followed by data mbufs containing the decrypted data. This * is different from KTLS TX which always uses an mb_ext_pgs mbuf for * both encrypted and decrypted data. TLS records decrypted by a NIC * should be queued to the socket buffer as records, but encrypted * data which needs to be decrypted by software arrives as a stream of * regular mbufs which need to be converted. In addition, there may * already be pending encrypted data in the socket buffer when KTLS RX * is enabled. * * To manage not-yet-decrypted data for KTLS RX, the following scheme * is used: * * - A single chain of NOTREADY mbufs is hung off of sb_mtls. * * - ktls_check_rx checks this chain of mbufs reading the TLS header * from the first mbuf. Once all of the data for that TLS record is * queued, the socket is queued to a worker thread. * * - The worker thread calls ktls_decrypt to decrypt TLS records in * the TLS chain. Each TLS record is detached from the TLS chain, * decrypted, and inserted into the regular socket buffer chain as * record starting with a control message holding the TLS header and * a chain of mbufs holding the encrypted data. */ static void sb_mark_notready(struct sockbuf *sb) { struct mbuf *m; m = sb->sb_mb; sb->sb_mtls = m; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); m->m_flags |= M_NOTREADY; sb->sb_acc -= m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mtlstail = m; } KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, sb->sb_ccc)); } /* * Return information about the pending TLS data in a socket * buffer. On return, 'seqno' is set to the sequence number * of the next TLS record to be received, 'resid' is set to * the amount of bytes still needed for the last pending * record. The function returns 'false' if the last pending * record contains a partial TLS header. In that case, 'resid' * is the number of bytes needed to complete the TLS header. */ bool ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp) { struct tls_record_layer hdr; struct mbuf *m; uint64_t seqno; size_t resid; u_int offset, record_len; SOCKBUF_LOCK_ASSERT(sb); MPASS(sb->sb_flags & SB_TLS_RX); seqno = sb->sb_tls_seqno; resid = sb->sb_tlscc; m = sb->sb_mtls; offset = 0; if (resid == 0) { *seqnop = seqno; *residp = 0; return (true); } for (;;) { seqno++; if (resid < sizeof(hdr)) { *seqnop = seqno; *residp = sizeof(hdr) - resid; return (false); } m_copydata(m, offset, sizeof(hdr), (void *)&hdr); record_len = sizeof(hdr) + ntohs(hdr.tls_length); if (resid <= record_len) { *seqnop = seqno; *residp = record_len - resid; return (true); } resid -= record_len; while (record_len != 0) { if (m->m_len - offset > record_len) { offset += record_len; break; } record_len -= (m->m_len - offset); offset = 0; m = m->m_next; } } } int ktls_enable_rx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_rcv.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); error = ktls_create_session(so, en, &tls, KTLS_RX); if (error) return (error); error = ktls_ocf_try(tls, KTLS_RX); if (error) { ktls_free(tls); return (error); } /* * Serialize with soreceive_generic() and make sure that we're not * operating on a listening socket. */ error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); if (error) { ktls_free(tls); return (error); } /* Mark the socket as using TLS offload. */ SOCK_RECVBUF_LOCK(so); if (__predict_false(so->so_rcv.sb_tls_info != NULL)) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); ktls_free(tls); return (EALREADY); } so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; so->so_rcv.sb_flags |= SB_TLS_RX; /* Mark existing data as not ready until it can be decrypted. */ sb_mark_notready(&so->so_rcv); ktls_check_rx(&so->so_rcv); SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); if (error) #endif error = ktls_try_ifnet(so, tls, KTLS_RX, false); if (error) ktls_use_sw(tls); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; struct tcpcb *tp; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls, KTLS_TX); if (error) return (error); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_TX); if (error) #endif error = ktls_try_ifnet(so, tls, KTLS_TX, false); if (error) error = ktls_try_sw(tls, KTLS_TX); if (error) { ktls_free(tls); return (error); } /* * Serialize with sosend_generic() and make sure that we're not * operating on a listening socket. */ error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { ktls_free(tls); return (error); } /* * Write lock the INP when setting sb_tls_info so that * routines in tcp_ratelimit.c can read sb_tls_info while * holding the INP lock. */ inp = so->so_pcb; INP_WLOCK(inp); SOCK_SENDBUF_LOCK(so); if (__predict_false(so->so_snd.sb_tls_info != NULL)) { SOCK_SENDBUF_UNLOCK(so); INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); ktls_free(tls); return (EALREADY); } so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) { tp = intotcpcb(inp); MPASS(tp->t_nic_ktls_xmit == 0); tp->t_nic_ktls_xmit = 1; if (tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 1); } SOCK_SENDBUF_UNLOCK(so); INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_rx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp __diagused; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_RECVBUF_LOCK(so); tls = so->so_rcv.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_RECVBUF_UNLOCK(so); return (0); } /* * ktls_get_rx_sequence - get the next TCP- and TLS- sequence number. * * This function gets information about the next TCP- and TLS- * sequence number to be processed by the TLS receive worker * thread. The information is extracted from the given "inpcb" * structure. The values are stored in host endian format at the two * given output pointer locations. The TCP sequence number points to * the beginning of the TLS header. * * This function returns zero on success, else a non-zero error code * is returned. */ int ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq) { struct socket *so; struct tcpcb *tp; INP_RLOCK(inp); so = inp->inp_socket; if (__predict_false(so == NULL)) { INP_RUNLOCK(inp); return (EINVAL); } if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); MPASS(tp != NULL); SOCKBUF_LOCK(&so->so_rcv); *tcpseq = tp->rcv_nxt - so->so_rcv.sb_tlscc; *tlsseq = so->so_rcv.sb_tls_seqno; SOCKBUF_UNLOCK(&so->so_rcv); INP_RUNLOCK(inp); return (0); } int ktls_get_tx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp __diagused; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_SENDBUF_LOCK(so); tls = so->so_snd.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_SENDBUF_UNLOCK(so); return (0); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; struct tcpcb *tp; int error; if (SOLISTENING(so)) return (EINVAL); switch (mode) { case TCP_TLS_MODE_SW: case TCP_TLS_MODE_IFNET: break; default: return (EINVAL); } inp = so->so_pcb; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); if (mode == TCP_TLS_MODE_IFNET) { /* Don't allow enabling ifnet ktls multiple times */ if (tp->t_nic_ktls_xmit) return (EALREADY); /* * Don't enable ifnet ktls if we disabled it due to an * excessive retransmission rate */ if (tp->t_nic_ktls_xmit_dis) return (ENXIO); } SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if (tls->mode == mode) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls, KTLS_TX); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, KTLS_TX, true); else error = ktls_try_sw(tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); SOCK_IO_SEND_UNLOCK(so); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->mode != TCP_TLS_MODE_SW) { MPASS(tp->t_nic_ktls_xmit == 0); tp->t_nic_ktls_xmit = 1; if (tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 1); } SOCKBUF_UNLOCK(&so->so_snd); SOCK_IO_SEND_UNLOCK(so); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); return (0); } /* * Try to allocate a new TLS receive tag. This task is scheduled when * sbappend_ktls_rx detects an input path change. If a new tag is * allocated, replace the tag in the TLS session. If a new tag cannot * be allocated, let the session fall back to software decryption. */ static void ktls_reset_receive_tag(void *context, int pending) { union if_snd_tag_alloc_params params; struct ktls_session *tls; struct m_snd_tag *mst; struct inpcb *inp; struct ifnet *ifp; struct socket *so; int error; MPASS(pending == 1); tls = context; so = tls->so; inp = so->so_pcb; ifp = NULL; INP_RLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); goto out; } SOCKBUF_LOCK(&so->so_rcv); mst = tls->snd_tag; tls->snd_tag = NULL; if (mst != NULL) m_snd_tag_rele(mst); ifp = tls->rx_ifp; if_ref(ifp); SOCKBUF_UNLOCK(&so->so_rcv); params.hdr.type = IF_SND_TAG_TYPE_TLS_RX; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; params.tls_rx.inp = inp; params.tls_rx.tls = tls; params.tls_rx.vlan_id = tls->rx_vlan_id; INP_RUNLOCK(inp); if (inp->inp_vflag & INP_IPV6) { if ((if_getcapenable2(ifp) & IFCAP2_RXTLS6) == 0) goto out; } else { if ((if_getcapenable2(ifp) & IFCAP2_RXTLS4) == 0) goto out; } error = m_snd_tag_alloc(ifp, ¶ms, &mst); if (error == 0) { SOCKBUF_LOCK(&so->so_rcv); tls->snd_tag = mst; SOCKBUF_UNLOCK(&so->so_rcv); counter_u64_add(ktls_ifnet_reset, 1); } else { /* * Just fall back to software decryption if a tag * cannot be allocated leaving the connection intact. * If a future input path change switches to another * interface this connection will resume ifnet TLS. */ counter_u64_add(ktls_ifnet_reset_failed, 1); } out: mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (ifp != NULL) if_rele(ifp); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); ktls_free(tls); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. * * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(inp->inp_vnet); tp = tcp_drop(tp, ECONNABORTED); CURVNET_RESTORE(); if (tp != NULL) { counter_u64_add(ktls_ifnet_reset_dropped, 1); INP_WUNLOCK(inp); } } else INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } void ktls_input_ifp_mismatch(struct sockbuf *sb, struct ifnet *ifp) { struct ktls_session *tls; struct socket *so; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); tls = sb->sb_tls_info; if_rele(tls->rx_ifp); if_ref(ifp); tls->rx_ifp = ifp; /* * See if we should schedule a task to update the receive tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); soref(so); tls->so = so; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; /* Can't get to the inp, but it should be locked. */ /* INP_LOCK_ASSERT(inp); */ MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag == NULL) { /* * Resetting send tag, ignore this change. The * pending reset may or may not see this updated rate * in the tcpcb. If it doesn't, we will just lose * this rate change. */ return (0); } mst = tls->snd_tag; MPASS(mst != NULL); MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT); return (mst->sw->snd_tag_modify(mst, ¶ms)); } #endif static void ktls_destroy_help(void *context, int pending __unused) { ktls_destroy(context); } void ktls_destroy(struct ktls_session *tls) { struct inpcb *inp; struct tcpcb *tp; bool wlocked; MPASS(tls->refcount == 0); inp = tls->inp; if (tls->tx) { wlocked = INP_WLOCKED(inp); if (!wlocked && !INP_TRY_WLOCK(inp)) { /* * rwlocks read locks are anonymous, and there * is no way to know if our current thread * holds an rlock on the inp. As a rough * estimate, check to see if the thread holds * *any* rlocks at all. If it does not, then we * know that we don't hold the inp rlock, and * can safely take the wlock */ if (curthread->td_rw_rlocks == 0) { INP_WLOCK(inp); } else { /* * We might hold the rlock, so let's * do the destroy in a taskqueue * context to avoid a potential * deadlock. This should be very * rare. */ counter_u64_add(ktls_destroy_task, 1); TASK_INIT(&tls->destroy_task, 0, ktls_destroy_help, tls); (void)taskqueue_enqueue(taskqueue_thread, &tls->destroy_task); return; } } } if (tls->sequential_records) { struct mbuf *m, *n; int page_count; STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) { page_count = m->m_epg_enc_cnt; while (page_count > 0) { KASSERT(page_count >= m->m_epg_nrdy, ("%s: too few pages", __func__)); page_count -= m->m_epg_nrdy; m = m_free(m); } } } counter_u64_add(ktls_offload_active, -1); switch (tls->mode) { case TCP_TLS_MODE_SW: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, -1); break; } break; case TCP_TLS_MODE_IFNET: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, -1); break; } if (tls->snd_tag != NULL) m_snd_tag_rele(tls->snd_tag); if (tls->rx_ifp != NULL) if_rele(tls->rx_ifp); if (tls->tx) { INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); MPASS(tp->t_nic_ktls_xmit == 1); tp->t_nic_ktls_xmit = 0; } break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, -1); break; } break; #endif } if (tls->ocf_session != NULL) ktls_ocf_free(tls); if (tls->params.auth_key != NULL) { zfree(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { zfree(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } if (tls->tx) { INP_WLOCK_ASSERT(inp); if (!in_pcbrele_wlocked(inp) && !wlocked) INP_WUNLOCK(inp); } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_seq: mapped mbuf %p", m)); m->m_epg_seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. To handle the * special case of empty fragments for TLS 1.0 sessions, an empty * fragment counts as one page. */ void ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; uint64_t *noncep; uint16_t tls_len; int maxlen __diagused; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be TLS records whose * payload does not exceed the maximum frame length. * * Empty TLS 1.0 records are permitted when using CBC. */ KASSERT(m->m_len <= maxlen && m->m_len >= 0 && (m->m_len > 0 || ktls_permit_empty_frames(tls)), ("ktls_frame: m %p len %d", m, m->m_len)); /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_frame: mapped mbuf %p (top = %p)", m, top)); tls_len = m->m_len; /* Save a reference to the session. */ m->m_epg_tls = ktls_hold(tls); m->m_epg_hdrlen = tls->params.tls_hlen; m->m_epg_trllen = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); m->m_epg_trllen -= delta; } m->m_len += m->m_epg_hdrlen + m->m_epg_trllen; /* Populate the TLS header. */ tlshdr = (void *)m->m_epg_hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; /* * TLS 1.3 masquarades as TLS 1.2 with a record type * of TLS_RLTYPE_APP. */ if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { tlshdr->tls_vminor = TLS_MINOR_VER_TWO; tlshdr->tls_type = TLS_RLTYPE_APP; /* save the real record type for later */ m->m_epg_record_type = record_type; m->m_epg_trail[0] = record_type; } else { tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * Store nonces / explicit IVs after the end of the * TLS header. * * For GCM with TLS 1.2, an 8 byte nonce is copied * from the end of the IV. The nonce is then * incremented for use by the next record. * * For CBC, a random nonce is inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && tls->params.tls_vminor == TLS_MINOR_VER_TWO) { noncep = (uint64_t *)(tls->params.iv + 8); be64enc(tlshdr + 1, *noncep); (*noncep)++; } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->mode == TCP_TLS_MODE_SW) { m->m_flags |= M_NOTREADY; if (__predict_false(tls_len == 0)) { /* TLS 1.0 empty fragment. */ m->m_epg_nrdy = 1; } else m->m_epg_nrdy = m->m_epg_npgs; *enq_cnt += m->m_epg_nrdy; } } } bool ktls_permit_empty_frames(struct ktls_session *tls) { return (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor == TLS_MINOR_VER_ZERO); } void ktls_check_rx(struct sockbuf *sb) { struct tls_record_layer hdr; struct ktls_wq *wq; struct socket *so; bool running; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); if (sb->sb_flags & SB_TLS_RX_RUNNING) return; /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < sizeof(hdr)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) so->so_error = EMSGSIZE; return; } m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); /* Is the entire record queued? */ if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0) so->so_error = EMSGSIZE; return; } sb->sb_flags |= SB_TLS_RX_RUNNING; soref(so); wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_rx_queued, 1); } static struct mbuf * ktls_detach_record(struct sockbuf *sb, int len) { struct mbuf *m, *n, *top; int remain; SOCKBUF_LOCK_ASSERT(sb); MPASS(len <= sb->sb_tlscc); /* * If TLS chain is the exact size of the record, * just grab the whole record. */ top = sb->sb_mtls; if (sb->sb_tlscc == len) { sb->sb_mtls = NULL; sb->sb_mtlstail = NULL; goto out; } /* * While it would be nice to use m_split() here, we need * to know exactly what m_split() allocates to update the * accounting, so do it inline instead. */ remain = len; for (m = top; remain > m->m_len; m = m->m_next) remain -= m->m_len; /* Easy case: don't have to split 'm'. */ if (remain == m->m_len) { sb->sb_mtls = m->m_next; if (sb->sb_mtls == NULL) sb->sb_mtlstail = NULL; m->m_next = NULL; goto out; } /* * Need to allocate an mbuf to hold the remainder of 'm'. Try * with M_NOWAIT first. */ n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { /* * Use M_WAITOK with socket buffer unlocked. If * 'sb_mtls' changes while the lock is dropped, return * NULL to force the caller to retry. */ SOCKBUF_UNLOCK(sb); n = m_get(M_WAITOK, MT_DATA); SOCKBUF_LOCK(sb); if (sb->sb_mtls != top) { m_free(n); return (NULL); } } n->m_flags |= (m->m_flags & (M_NOTREADY | M_DECRYPTED)); /* Store remainder in 'n'. */ n->m_len = m->m_len - remain; if (m->m_flags & M_EXT) { n->m_data = m->m_data + remain; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); } /* Trim 'm' and update accounting. */ m->m_len -= n->m_len; sb->sb_tlscc -= n->m_len; sb->sb_ccc -= n->m_len; /* Account for 'n'. */ sballoc_ktls_rx(sb, n); /* Insert 'n' into the TLS chain. */ sb->sb_mtls = n; n->m_next = m->m_next; if (sb->sb_mtlstail == m) sb->sb_mtlstail = n; /* Detach the record from the TLS chain. */ m->m_next = NULL; out: MPASS(m_length(top, NULL) == len); for (m = top; m != NULL; m = m->m_next) sbfree_ktls_rx(sb, m); sb->sb_tlsdcc = len; sb->sb_ccc += len; SBCHECK(sb); return (top); } /* * Determine the length of the trailing zero padding and find the real * record type in the byte before the padding. * * Walking the mbuf chain backwards is clumsy, so another option would * be to scan forwards remembering the last non-zero byte before the * trailer. However, it would be expensive to scan the entire record. * Instead, find the last non-zero byte of each mbuf in the chain * keeping track of the relative offset of that nonzero byte. * * trail_len is the size of the MAC/tag on input and is set to the * size of the full trailer including padding and the record type on * return. */ static int tls13_find_record_type(struct ktls_session *tls, struct mbuf *m, int tls_len, int *trailer_len, uint8_t *record_typep) { char *cp; u_int digest_start, last_offset, m_len, offset; uint8_t record_type; digest_start = tls_len - *trailer_len; last_offset = 0; offset = 0; for (; m != NULL && offset < digest_start; offset += m->m_len, m = m->m_next) { /* Don't look for padding in the tag. */ m_len = min(digest_start - offset, m->m_len); cp = mtod(m, char *); /* Find last non-zero byte in this mbuf. */ while (m_len > 0 && cp[m_len - 1] == 0) m_len--; if (m_len > 0) { record_type = cp[m_len - 1]; last_offset = offset + m_len; } } if (last_offset < tls->params.tls_hlen) return (EBADMSG); *record_typep = record_type; *trailer_len = tls_len - last_offset + 1; return (0); } /* * Check if a mbuf chain is fully decrypted at the given offset and * length. Returns KTLS_MBUF_CRYPTO_ST_DECRYPTED if all data is * decrypted. KTLS_MBUF_CRYPTO_ST_MIXED if there is a mix of encrypted * and decrypted data. Else KTLS_MBUF_CRYPTO_ST_ENCRYPTED if all data * is encrypted. */ ktls_mbuf_crypto_st_t ktls_mbuf_crypto_state(struct mbuf *mb, int offset, int len) { int m_flags_ored = 0; int m_flags_anded = -1; for (; mb != NULL; mb = mb->m_next) { if (offset < mb->m_len) break; offset -= mb->m_len; } offset += len; for (; mb != NULL; mb = mb->m_next) { m_flags_ored |= mb->m_flags; m_flags_anded &= mb->m_flags; if (offset <= mb->m_len) break; offset -= mb->m_len; } MPASS(mb != NULL || offset == 0); if ((m_flags_ored ^ m_flags_anded) & M_DECRYPTED) return (KTLS_MBUF_CRYPTO_ST_MIXED); else return ((m_flags_ored & M_DECRYPTED) ? KTLS_MBUF_CRYPTO_ST_DECRYPTED : KTLS_MBUF_CRYPTO_ST_ENCRYPTED); } /* * ktls_resync_ifnet - get HW TLS RX back on track after packet loss */ static int ktls_resync_ifnet(struct socket *so, uint32_t tls_len, uint64_t tls_rcd_num) { union if_snd_tag_modify_params params; struct m_snd_tag *mst; struct inpcb *inp; struct tcpcb *tp; mst = so->so_rcv.sb_tls_info->snd_tag; if (__predict_false(mst == NULL)) return (EINVAL); inp = sotoinpcb(so); if (__predict_false(inp == NULL)) return (EINVAL); INP_RLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); MPASS(tp != NULL); /* Get the TCP sequence number of the next valid TLS header. */ SOCKBUF_LOCK(&so->so_rcv); params.tls_rx.tls_hdr_tcp_sn = tp->rcv_nxt - so->so_rcv.sb_tlscc - tls_len; params.tls_rx.tls_rec_length = tls_len; params.tls_rx.tls_seq_number = tls_rcd_num; SOCKBUF_UNLOCK(&so->so_rcv); INP_RUNLOCK(inp); MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RX); return (mst->sw->snd_tag_modify(mst, ¶ms)); } static void ktls_drop(struct socket *so, int error) { struct epoch_tracker et; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(inp->inp_vnet); tp = tcp_drop(tp, error); CURVNET_RESTORE(); if (tp != NULL) INP_WUNLOCK(inp); } else { so->so_error = error; SOCK_RECVBUF_LOCK(so); sorwakeup_locked(so); INP_WUNLOCK(inp); } NET_EPOCH_EXIT(et); } static void ktls_decrypt(struct socket *so) { char tls_header[MBUF_PEXT_HDR_LEN]; struct ktls_session *tls; struct sockbuf *sb; struct tls_record_layer *hdr; struct tls_get_record tgr; struct mbuf *control, *data, *m; ktls_mbuf_crypto_st_t state; uint64_t seqno; int error, remain, tls_len, trail_len; bool tls13; uint8_t vminor, record_type; hdr = (struct tls_record_layer *)tls_header; sb = &so->so_rcv; SOCKBUF_LOCK(sb); KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, ("%s: socket %p not running", __func__, so)); tls = sb->sb_tls_info; MPASS(tls != NULL); tls13 = (tls->params.tls_vminor == TLS_MINOR_VER_THREE); if (tls13) vminor = TLS_MINOR_VER_TWO; else vminor = tls->params.tls_vminor; for (;;) { /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < tls->params.tls_hlen) break; m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); if (hdr->tls_vmajor != tls->params.tls_vmajor || hdr->tls_vminor != vminor) error = EINVAL; else if (tls13 && hdr->tls_type != TLS_RLTYPE_APP) error = EINVAL; else if (tls_len < tls->params.tls_hlen || tls_len > tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + tls->params.tls_tlen) error = EMSGSIZE; else error = 0; if (__predict_false(error != 0)) { /* * We have a corrupted record and are likely * out of sync. The connection isn't * recoverable at this point, so abort it. */ SOCKBUF_UNLOCK(sb); counter_u64_add(ktls_offload_corrupted_records, 1); ktls_drop(so, error); goto deref; } /* Is the entire record queued? */ if (sb->sb_tlscc < tls_len) break; /* * Split out the portion of the mbuf chain containing * this TLS record. */ data = ktls_detach_record(sb, tls_len); if (data == NULL) continue; MPASS(sb->sb_tlsdcc == tls_len); seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; SBCHECK(sb); SOCKBUF_UNLOCK(sb); /* get crypto state for this TLS record */ state = ktls_mbuf_crypto_state(data, 0, tls_len); switch (state) { case KTLS_MBUF_CRYPTO_ST_MIXED: error = ktls_ocf_recrypt(tls, hdr, data, seqno); if (error) break; /* FALLTHROUGH */ case KTLS_MBUF_CRYPTO_ST_ENCRYPTED: error = ktls_ocf_decrypt(tls, hdr, data, seqno, &trail_len); if (__predict_true(error == 0)) { if (tls13) { error = tls13_find_record_type(tls, data, tls_len, &trail_len, &record_type); } else { record_type = hdr->tls_type; } } break; case KTLS_MBUF_CRYPTO_ST_DECRYPTED: /* * NIC TLS is only supported for AEAD * ciphersuites which used a fixed sized * trailer. */ if (tls13) { trail_len = tls->params.tls_tlen - 1; error = tls13_find_record_type(tls, data, tls_len, &trail_len, &record_type); } else { trail_len = tls->params.tls_tlen; error = 0; record_type = hdr->tls_type; } break; default: error = EINVAL; break; } if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* * sbcut/drop/flush discarded these * mbufs. */ m_freem(data); break; } /* * Drop this TLS record's data, but keep * decrypting subsequent records. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; if (error != EMSGSIZE) error = EBADMSG; CURVNET_SET(so->so_vnet); so->so_error = error; sorwakeup_locked(so); CURVNET_RESTORE(); m_freem(data); SOCKBUF_LOCK(sb); continue; } /* Allocate the control mbuf. */ memset(&tgr, 0, sizeof(tgr)); tgr.tls_type = record_type; tgr.tls_vmajor = hdr->tls_vmajor; tgr.tls_vminor = hdr->tls_vminor; tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - trail_len); control = sbcreatecontrol(&tgr, sizeof(tgr), TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* sbcut/drop/flush discarded these mbufs. */ MPASS(sb->sb_tlscc == 0); m_freem(data); m_freem(control); break; } /* * Clear the 'dcc' accounting in preparation for * adding the decrypted record. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; SBCHECK(sb); /* If there is no payload, drop all of the data. */ if (tgr.tls_length == htobe16(0)) { m_freem(data); data = NULL; } else { /* Trim header. */ remain = tls->params.tls_hlen; while (remain > 0) { if (data->m_len > remain) { data->m_data += remain; data->m_len -= remain; break; } remain -= data->m_len; data = m_free(data); } /* Trim trailer and clear M_NOTREADY. */ remain = be16toh(tgr.tls_length); m = data; for (m = data; remain > m->m_len; m = m->m_next) { m->m_flags &= ~(M_NOTREADY | M_DECRYPTED); remain -= m->m_len; } m->m_len = remain; m_freem(m->m_next); m->m_next = NULL; m->m_flags &= ~(M_NOTREADY | M_DECRYPTED); /* Set EOR on the final mbuf. */ m->m_flags |= M_EOR; } sbappendcontrol_locked(sb, data, control, 0); if (__predict_false(state != KTLS_MBUF_CRYPTO_ST_DECRYPTED)) { sb->sb_flags |= SB_TLS_RX_RESYNC; SOCKBUF_UNLOCK(sb); ktls_resync_ifnet(so, tls_len, seqno); SOCKBUF_LOCK(sb); } else if (__predict_false(sb->sb_flags & SB_TLS_RX_RESYNC)) { sb->sb_flags &= ~SB_TLS_RX_RESYNC; SOCKBUF_UNLOCK(sb); ktls_resync_ifnet(so, 0, seqno); SOCKBUF_LOCK(sb); } } sb->sb_flags &= ~SB_TLS_RX_RUNNING; if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) so->so_error = EMSGSIZE; sorwakeup_locked(so); deref: SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); } void ktls_enqueue_to_free(struct mbuf *m) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } static void * ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m) { void *buf; int domain, running; if (m->m_epg_npgs <= 2) return (NULL); if (ktls_buffer_zone == NULL) return (NULL); if ((u_int)(ticks - wq->lastallocfail) < hz) { /* * Rate-limit allocation attempts after a failure. * ktls_buffer_import() will acquire a per-domain mutex to check * the free page queues and may fail consistently if memory is * fragmented. */ return (NULL); } buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM); if (buf == NULL) { domain = PCPU_GET(domain); wq->lastallocfail = ticks; /* * Note that this check is "racy", but the races are * harmless, and are either a spurious wakeup if * multiple threads fail allocations before the alloc * thread wakes, or waiting an extra second in case we * see an old value of running == true. */ if (!VM_DOMAIN_EMPTY(domain)) { running = atomic_load_int(&ktls_domains[domain].reclaim_td.running); if (!running) wakeup(&ktls_domains[domain].reclaim_td); } } return (buf); } static int ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m, struct ktls_session *tls, struct ktls_ocf_encrypt_state *state) { vm_page_t pg; int error, i, len, off; KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY), ("%p not unready & nomap mbuf\n", m)); KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen, ("page count %d larger than maximum frame length %d", m->m_epg_npgs, ktls_maxlen)); /* Anonymous mbufs are encrypted in place. */ if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) return (ktls_ocf_encrypt(state, tls, m, NULL, 0)); /* * For file-backed mbufs (from sendfile), anonymous wired * pages are allocated and used as the encryption destination. */ if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) { len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len - m->m_epg_1st_off; state->dst_iov[0].iov_base = (char *)state->cbuf + m->m_epg_1st_off; state->dst_iov[0].iov_len = len; state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf); i = 1; } else { off = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs; i++, off = 0) { pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); len = m_epg_pagelen(m, i, off); state->parray[i] = VM_PAGE_TO_PHYS(pg); state->dst_iov[i].iov_base = (char *)PHYS_TO_DMAP(state->parray[i]) + off; state->dst_iov[i].iov_len = len; } } KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small")); state->dst_iov[i].iov_base = m->m_epg_trail; state->dst_iov[i].iov_len = m->m_epg_trllen; error = ktls_ocf_encrypt(state, tls, m, state->dst_iov, i + 1); if (__predict_false(error != 0)) { /* Free the anonymous pages. */ if (state->cbuf != NULL) uma_zfree(ktls_buffer_zone, state->cbuf); else { for (i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(state->parray[i]); (void)vm_page_unwire_noq(pg); vm_page_free(pg); } } } return (error); } /* Number of TLS records in a batch passed to ktls_enqueue(). */ static u_int ktls_batched_records(struct mbuf *m) { int page_count, records; records = 0; page_count = m->m_epg_enc_cnt; while (page_count > 0) { records++; page_count -= m->m_epg_nrdy; m = m->m_next; } KASSERT(page_count == 0, ("%s: mismatched page count", __func__)); return (records); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct ktls_session *tls; struct ktls_wq *wq; int queued; bool running; KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf")); m->m_epg_enc_cnt = page_count; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). */ m->m_epg_so = so; queued = 1; tls = m->m_epg_tls; wq = &ktls_wq[tls->wq_index]; mtx_lock(&wq->mtx); if (__predict_false(tls->sequential_records)) { /* * For TLS 1.0, records must be encrypted * sequentially. For a given connection, all records * queued to the associated work queue are processed * sequentially. However, sendfile(2) might complete * I/O requests spanning multiple TLS records out of * order. Here we ensure TLS records are enqueued to * the work queue in FIFO order. * * tls->next_seqno holds the sequence number of the * next TLS record that should be enqueued to the work * queue. If this next record is not tls->next_seqno, * it must be a future record, so insert it, sorted by * TLS sequence number, into tls->pending_records and * return. * * If this TLS record matches tls->next_seqno, place * it in the work queue and then check * tls->pending_records to see if any * previously-queued records are now ready for * encryption. */ if (m->m_epg_seqno != tls->next_seqno) { struct mbuf *n, *p; p = NULL; STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) { if (n->m_epg_seqno > m->m_epg_seqno) break; p = n; } if (n == NULL) STAILQ_INSERT_TAIL(&tls->pending_records, m, m_epg_stailq); else if (p == NULL) STAILQ_INSERT_HEAD(&tls->pending_records, m, m_epg_stailq); else STAILQ_INSERT_AFTER(&tls->pending_records, p, m, m_epg_stailq); mtx_unlock(&wq->mtx); counter_u64_add(ktls_cnt_tx_pending, 1); return; } tls->next_seqno += ktls_batched_records(m); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); while (!STAILQ_EMPTY(&tls->pending_records)) { struct mbuf *n; n = STAILQ_FIRST(&tls->pending_records); if (n->m_epg_seqno != tls->next_seqno) break; queued++; STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq); tls->next_seqno += ktls_batched_records(n); STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq); } counter_u64_add(ktls_cnt_tx_pending, -(queued - 1)); } else STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_tx_queued, queued); } /* * Once a file-backed mbuf (from sendfile) has been encrypted, free * the pages from the file and replace them with the anonymous pages * allocated in ktls_encrypt_record(). */ static void ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state) { int i; MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0); /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ if (state->cbuf != NULL) { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[0] + ptoa(i); /* Contig pages should go back to the cache. */ m->m_ext.ext_free = ktls_free_mext_contig; } else { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[i]; /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; } /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; } static __noinline void ktls_encrypt(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state state; struct ktls_session *tls; struct socket *so; struct mbuf *m; int error, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); error = ktls_encrypt_record(wq, m, tls, &state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, &state); m->m_flags |= M_RDONLY; npages += m->m_epg_nrdy; /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ m->m_epg_tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)so->so_proto->pr_ready(so, top, npages); } else { ktls_drop(so, EIO); mb_free_notready(top, total_pages); } sorele(so); CURVNET_RESTORE(); } void ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error) { struct ktls_session *tls; struct socket *so; struct mbuf *m; int npages; m = state->m; if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, state); m->m_flags |= M_RDONLY; so = state->so; free(state, M_KTLS); /* * Drop a reference to the session now that it is no longer * needed. Existing code depends on encrypted records having * no associated session vs yet-to-be-encrypted records having * an associated session. */ tls = m->m_epg_tls; m->m_epg_tls = NULL; ktls_free(tls); if (error != 0) counter_u64_add(ktls_offload_failed_crypto, 1); CURVNET_SET(so->so_vnet); npages = m->m_epg_nrdy; if (error == 0) { (void)so->so_proto->pr_ready(so, m, npages); } else { ktls_drop(so, EIO); mb_free_notready(m, npages); } sorele(so); CURVNET_RESTORE(); } /* * Similar to ktls_encrypt, but used with asynchronous OCF backends * (coprocessors) where encryption does not use host CPU resources and * it can be beneficial to queue more requests than CPUs. */ static __noinline void ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state *state; struct ktls_session *tls; struct socket *so; struct mbuf *m, *n; int error, mpages, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; error = 0; for (m = top; npages != total_pages; m = n) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO); soref(so); state->so = so; state->m = m; mpages = m->m_epg_nrdy; n = m->m_next; error = ktls_encrypt_record(wq, m, tls, state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); free(state, M_KTLS); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); break; } npages += mpages; } CURVNET_SET(so->so_vnet); if (error != 0) { ktls_drop(so, EIO); mb_free_notready(m, total_pages - npages); } sorele(so); CURVNET_RESTORE(); } static int ktls_bind_domain(int domain) { int error; error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]); if (error != 0) return (error); curthread->td_domain.dr_policy = DOMAINSET_PREF(domain); return (0); } static void ktls_reclaim_thread(void *ctx) { struct ktls_domain_info *ktls_domain = ctx; struct ktls_reclaim_thread *sc = &ktls_domain->reclaim_td; struct sysctl_oid *oid; char name[80]; int error, domain; domain = ktls_domain - ktls_domains; if (bootverbose) printf("Starting KTLS reclaim thread for domain %d\n", domain); error = ktls_bind_domain(domain); if (error) printf("Unable to bind KTLS reclaim thread for domain %d: error %d\n", domain, error); snprintf(name, sizeof(name), "domain%d", domain); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "reclaims", CTLFLAG_RD, &sc->reclaims, 0, "buffers reclaimed"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups", CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running", CTLFLAG_RD, &sc->running, 0, "thread running"); for (;;) { atomic_store_int(&sc->running, 0); tsleep(sc, PZERO | PNOLOCK, "-", 0); atomic_store_int(&sc->running, 1); sc->wakeups++; /* * Below we attempt to reclaim ktls_max_reclaim * buffers using vm_page_reclaim_contig_domain_ext(). * We do this here, as this function can take several * seconds to scan all of memory and it does not * matter if this thread pauses for a while. If we * block a ktls worker thread, we risk developing * backlogs of buffers to be encrypted, leading to * surges of traffic and potential NIC output drops. */ if (vm_page_reclaim_contig_domain_ext(domain, VM_ALLOC_NORMAL, atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, ktls_max_reclaim) != 0) { vm_wait_domain(domain); } else { sc->reclaims += ktls_max_reclaim; } } } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; struct socket *so, *son; STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; int cpu; cpu = wq - ktls_wq; if (bootverbose) printf("Starting KTLS worker thread for CPU %d\n", cpu); /* * Bind to a core. If ktls_bind_threads is > 1, then * we bind to the NUMA domain instead. */ if (ktls_bind_threads) { int error; if (ktls_bind_threads > 1) { struct pcpu *pc = pcpu_find(cpu); error = ktls_bind_domain(pc->pc_domain); } else { cpuset_t mask; CPU_SETOF(cpu, &mask); error = cpuset_setthread(curthread->td_tid, &mask); } if (error) printf("Unable to bind KTLS worker thread for CPU %d: error %d\n", cpu, error); } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->m_head) && STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_m_head); STAILQ_CONCAT(&local_m_head, &wq->m_head); STAILQ_INIT(&local_so_head); STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); m_free_raw(m); } else { if (m->m_epg_tls->sync_dispatch) ktls_encrypt(wq, m); else ktls_encrypt_async(wq, m); counter_u64_add(ktls_cnt_tx_queued, -1); } } STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { ktls_decrypt(so); counter_u64_add(ktls_cnt_rx_queued, -1); } } } static void ktls_disable_ifnet_help(void *context, int pending __unused) { struct ktls_session *tls; struct inpcb *inp; struct tcpcb *tp; struct socket *so; int err; tls = context; inp = tls->inp; if (inp == NULL) return; INP_WLOCK(inp); so = inp->inp_socket; MPASS(so != NULL); if (inp->inp_flags & INP_DROPPED) { goto out; } if (so->so_snd.sb_tls_info != NULL) err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW); else err = ENXIO; if (err == 0) { counter_u64_add(ktls_ifnet_disable_ok, 1); /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ if ((inp->inp_flags & INP_DROPPED) == 0 && (tp = intotcpcb(inp)) != NULL && tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 0); } else { counter_u64_add(ktls_ifnet_disable_fail, 1); } out: CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); INP_WUNLOCK(inp); ktls_free(tls); } /* * Called when re-transmits are becoming a substantial portion of the * sends on this connection. When this happens, we transition the * connection to software TLS. This is needed because most inline TLS * NICs keep crypto state only for in-order transmits. This means * that to handle a TCP rexmit (which is out-of-order), the NIC must * re-DMA the entire TLS record up to and including the current * segment. This means that when re-transmitting the last ~1448 byte * segment of a 16KB TLS record, we could wind up re-DMA'ing an order * of magnitude more data than we are sending. This can cause the * PCIe link to saturate well before the network, which can cause * output drops, and a general loss of capacity. */ void ktls_disable_ifnet(void *arg) { struct tcpcb *tp; struct inpcb *inp; struct socket *so; struct ktls_session *tls; tp = arg; inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); so = inp->inp_socket; SOCK_LOCK(so); tls = so->so_snd.sb_tls_info; if (tp->t_nic_ktls_xmit_dis == 1) { SOCK_UNLOCK(so); return; } /* * note that t_nic_ktls_xmit_dis is never cleared; disabling * ifnet can only be done once per connection, so we never want * to do it again */ (void)ktls_hold(tls); soref(so); tp->t_nic_ktls_xmit_dis = 1; SOCK_UNLOCK(so); TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); } diff --git a/sys/vm/uma.h b/sys/vm/uma.h index 38865df7ae02..4f2b143a2fae 100644 --- a/sys/vm/uma.h +++ b/sys/vm/uma.h @@ -1,750 +1,751 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ /* * uma.h - External definitions for the Universal Memory Allocator * */ #ifndef _VM_UMA_H_ #define _VM_UMA_H_ #include /* For NULL */ #include /* For M_* */ #include /* User visible parameters */ #define UMA_SMALLEST_UNIT 8 /* Smallest item allocated */ /* Types and type defs */ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; /* * Item constructor * * Arguments: * item A pointer to the memory which has been allocated. * arg The arg field passed to uma_zalloc_arg * size The size of the allocated item * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The constructor is called just before the memory is returned * to the user. It may block if necessary. */ typedef int (*uma_ctor)(void *mem, int size, void *arg, int flags); /* * Item destructor * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being destructed. * arg Argument passed through uma_zfree_arg * * Returns: * Nothing * * Discussion: * The destructor may perform operations that differ from those performed * by the initializer, but it must leave the object in the same state. * This IS type stable storage. This is called after EVERY zfree call. */ typedef void (*uma_dtor)(void *mem, int size, void *arg); /* * Item initializer * * Arguments: * item A pointer to the memory which has been allocated. * size The size of the item being initialized. * flags See zalloc flags * * Returns: * 0 on success * errno on failure * * Discussion: * The initializer is called when the memory is cached in the uma zone. * The initializer and the destructor should leave the object in the same * state. */ typedef int (*uma_init)(void *mem, int size, int flags); /* * Item discard function * * Arguments: * item A pointer to memory which has been 'freed' but has not left the * zone's cache. * size The size of the item being discarded. * * Returns: * Nothing * * Discussion: * This routine is called when memory leaves a zone and is returned to the * system for other uses. It is the counter-part to the init function. */ typedef void (*uma_fini)(void *mem, int size); /* * Import new memory into a cache zone. */ typedef int (*uma_import)(void *arg, void **store, int count, int domain, int flags); /* * Free memory from a cache zone. */ typedef void (*uma_release)(void *arg, void **store, int count); /* * What's the difference between initializing and constructing? * * The item is initialized when it is cached, and this is the state that the * object should be in when returned to the allocator. The purpose of this is * to remove some code which would otherwise be called on each allocation by * utilizing a known, stable state. This differs from the constructor which * will be called on EVERY allocation. * * For example, in the initializer you may want to initialize embedded locks, * NULL list pointers, set up initial states, magic numbers, etc. This way if * the object is held in the allocator and re-used it won't be necessary to * re-initialize it. * * The constructor may be used to lock a data structure, link it on to lists, * bump reference counts or total counts of outstanding structures, etc. * */ /* Function proto types */ /* * Create a new uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * size The size of the object that is being created. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * init An initializer that sets up the initial state of the memory. * fini A discard function that undoes initialization done by init. * ctor/dtor/init/fini may all be null, see notes above. * align A bitmask that corresponds to the requested alignment * eg 4 would be 0x3 * flags A set of parameters that control the behavior of the zone. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags); /* * Create a secondary uma zone * * Arguments: * name The text name of the zone for debugging and stats. This memory * should not be freed until the zone has been deallocated. * ctor The constructor that is called when the object is allocated. * dtor The destructor that is called when the object is freed. * zinit An initializer that sets up the initial state of the memory * as the object passes from the Keg's slab to the Zone's cache. * zfini A discard function that undoes initialization done by init * as the object passes from the Zone's cache to the Keg's slab. * * ctor/dtor/zinit/zfini may all be null, see notes above. * Note that the zinit and zfini specified here are NOT * exactly the same as the init/fini specified to uma_zcreate() * when creating a primary zone. These zinit/zfini are called * on the TRANSITION from keg to zone (and vice-versa). Once * these are set, the primary zone may alter its init/fini * (which are called when the object passes from VM to keg) * using uma_zone_set_init/fini()) as well as its own * zinit/zfini (unset by default for primary zone) with * uma_zone_set_zinit/zfini() (note subtle 'z' prefix). * * primary A reference to this zone's Primary Zone which contains the * backing Keg for the Secondary Zone being added. * * Returns: * A pointer to a structure which is intended to be opaque to users of * the interface. The value may be null if the wait flag is not set. */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary); /* * Create cache-only zones. * * This allows uma's per-cpu cache facilities to handle arbitrary * pointers. Consumers must specify the import and release functions to * fill and destroy caches. UMA does not allocate any memory for these * zones. The 'arg' parameter is passed to import/release and is caller * specific. */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags); /* * Definitions for uma_zcreate flags * * These flags share space with UMA_ZFLAGs in uma_int.h. Be careful not to * overlap when adding new features. */ #define UMA_ZONE_UNMANAGED 0x0001 /* * Don't regulate the cache size, even * under memory pressure. */ #define UMA_ZONE_ZINIT 0x0002 /* Initialize with zeros */ #define UMA_ZONE_CONTIG 0x0004 /* * Physical memory underlying an object * must be contiguous. */ #define UMA_ZONE_NOTOUCH 0x0008 /* UMA may not access the memory */ #define UMA_ZONE_MALLOC 0x0010 /* For use by malloc(9) only! */ #define UMA_ZONE_NOFREE 0x0020 /* Do not free slabs of this type! */ #define UMA_ZONE_MTXCLASS 0x0040 /* Create a new lock class */ #define UMA_ZONE_VM 0x0080 /* * Used for internal vm datastructures * only. */ #define UMA_ZONE_NOTPAGE 0x0100 /* allocf memory not vm pages */ #define UMA_ZONE_SECONDARY 0x0200 /* Zone is a Secondary Zone */ #define UMA_ZONE_NOBUCKET 0x0400 /* Do not use buckets. */ #define UMA_ZONE_MAXBUCKET 0x0800 /* Use largest buckets. */ +#define UMA_ZONE_NOTRIM 0x1000 /* Don't trim this zone */ #define UMA_ZONE_CACHESPREAD 0x2000 /* * Spread memory start locations across * all possible cache lines. May * require many virtually contiguous * backend pages and can fail early. */ #define UMA_ZONE_NODUMP 0x4000 /* * Zone's pages will not be included in * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* * Allocates mp_maxid + 1 slabs of * PAGE_SIZE */ #define UMA_ZONE_FIRSTTOUCH 0x10000 /* First touch NUMA policy */ #define UMA_ZONE_ROUNDROBIN 0x20000 /* Round-robin NUMA policy. */ #define UMA_ZONE_SMR 0x40000 /* * Safe memory reclamation defers * frees until all read sections * have exited. This flag creates * a unique SMR context for this * zone. To share contexts see * uma_zone_set_smr() below. * * See sys/smr.h for more details. */ #define UMA_ZONE_NOKASAN 0x80000 /* * Disable KASAN verification. This is * implied by NOFREE. Cache zones are * not verified by default. */ /* In use by UMA_ZFLAGs: 0xffe00000 */ /* * These flags are shared between the keg and zone. Some are determined * based on physical parameters of the request and may not be provided by * the consumer. */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_NOTOUCH | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ UMA_ZONE_VM | UMA_ZONE_NOTPAGE | UMA_ZONE_PCPU | \ UMA_ZONE_FIRSTTOUCH | UMA_ZONE_ROUNDROBIN | UMA_ZONE_NOKASAN) /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ #define UMA_ALIGN_LONG (sizeof(long) - 1) /* "" long */ #define UMA_ALIGN_INT (sizeof(int) - 1) /* "" int */ #define UMA_ALIGN_SHORT (sizeof(short) - 1) /* "" short */ #define UMA_ALIGN_CHAR (sizeof(char) - 1) /* "" char */ #define UMA_ALIGN_CACHE (uma_get_cache_align_mask()) /* Cache line size align */ /* Align both to cache line size and an explicit alignment (through mask). */ #define UMA_ALIGN_CACHE_AND_MASK(mask) (uma_get_cache_align_mask() | (mask)) #define UMA_ALIGNOF(type) (_Alignof(type) - 1) /* Alignment fit for 'type' */ #define UMA_ANYDOMAIN -1 /* Special value for domain search. */ /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * * Arguments: * zone The zone we want to destroy. * */ void uma_zdestroy(uma_zone_t zone); /* * Allocates an item out of a zone * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * flags See sys/malloc.h for available flags. * * Returns: * A non-null pointer to an initialized element from the zone is * guaranteed if the wait flag is M_WAITOK. Otherwise a null pointer * may be returned if the zone is empty or the ctor failed. */ void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags); /* Allocate per-cpu data. Access the correct data with zpcpu_get(). */ void *uma_zalloc_pcpu_arg(uma_zone_t zone, void *arg, int flags); /* Use with SMR zones. */ void *uma_zalloc_smr(uma_zone_t zone, int flags); /* * Allocate an item from a specific NUMA domain. This uses a slow path in * the allocator but is guaranteed to allocate memory from the requested * domain if M_WAITOK is set. * * Arguments: * zone The zone we are allocating from * arg This data is passed to the ctor function * domain The domain to allocate from. * flags See sys/malloc.h for available flags. */ void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags); /* * Allocates an item out of a zone without supplying an argument * * This is just a wrapper for uma_zalloc_arg for convenience. * */ static __inline void *uma_zalloc(uma_zone_t zone, int flags); static __inline void *uma_zalloc_pcpu(uma_zone_t zone, int flags); static __inline void * uma_zalloc(uma_zone_t zone, int flags) { return uma_zalloc_arg(zone, NULL, flags); } static __inline void * uma_zalloc_pcpu(uma_zone_t zone, int flags) { return uma_zalloc_pcpu_arg(zone, NULL, flags); } /* * Frees an item back into the specified zone. * * Arguments: * zone The zone the item was originally allocated out of. * item The memory to be freed. * arg Argument passed to the destructor * * Returns: * Nothing. */ void uma_zfree_arg(uma_zone_t zone, void *item, void *arg); /* Use with PCPU zones. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *arg); /* Use with SMR zones. */ void uma_zfree_smr(uma_zone_t zone, void *item); /* * Frees an item back to a zone without supplying an argument * * This is just a wrapper for uma_zfree_arg for convenience. * */ static __inline void uma_zfree(uma_zone_t zone, void *item); static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item); static __inline void uma_zfree(uma_zone_t zone, void *item) { uma_zfree_arg(zone, item, NULL); } static __inline void uma_zfree_pcpu(uma_zone_t zone, void *item) { uma_zfree_pcpu_arg(zone, item, NULL); } /* * Wait until the specified zone can allocate an item. */ void uma_zwait(uma_zone_t zone); /* * Backend page supplier routines * * Arguments: * zone The zone that is requesting pages. * size The number of bytes being requested. * pflag Flags for these memory pages, see below. * domain The NUMA domain that we prefer for this allocation. * wait Indicates our willingness to block. * * Returns: * A pointer to the allocated memory or NULL on failure. */ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, uint8_t *pflag, int wait); /* * Backend page free routines * * Arguments: * item A pointer to the previously allocated pages. * size The original size of the allocation. * pflag The flags for the slab. See UMA_SLAB_* below. * * Returns: * None */ typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); /* * Reclaims unused memory. If no NUMA domain is specified, memory from all * domains is reclaimed. * * Arguments: * req Reclamation request type. * domain The target NUMA domain. * Returns: * None */ #define UMA_RECLAIM_DRAIN 1 /* release bucket cache */ #define UMA_RECLAIM_DRAIN_CPU 2 /* release bucket and per-CPU caches */ #define UMA_RECLAIM_TRIM 3 /* trim bucket cache to WSS */ void uma_reclaim(int req); void uma_reclaim_domain(int req, int domain); void uma_zone_reclaim(uma_zone_t, int req); void uma_zone_reclaim_domain(uma_zone_t, int req, int domain); /* * Sets the alignment mask to be used for all zones requesting cache * alignment. Should be called by MD boot code prior to starting VM/UMA. * * Arguments: * mask The alignment mask * * Returns: * Nothing */ void uma_set_cache_align_mask(unsigned int mask); #include /* * Set a reserved number of items to hold for M_USE_RESERVE allocations. All * other requests must allocate new backing pages. */ void uma_zone_reserve(uma_zone_t zone, int nitems); /* * Reserves the maximum KVA space required by the zone and configures the zone * to use a backend that allocates physical memory and maps it using the * reserved KVA. * * Arguments: * zone The zone to update. * nitems The upper limit on the number of items that can be allocated. * * Returns: * 0 if KVA space can not be allocated * 1 if successful * * Discussion: * When the machine supports a direct map and the zone's items are smaller * than a page, the zone will use the direct map instead of allocating KVA * space. */ int uma_zone_reserve_kva(uma_zone_t zone, int nitems); /* * Sets an upper limit on the number of items allocated from a zone * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed * * Returns: * int The effective value of nitems */ int uma_zone_set_max(uma_zone_t zone, int nitems); /* * Sets an upper limit on the number of items allowed in zone's caches * * Arguments: * zone The zone to limit * nitems The requested upper limit on the number of items allowed */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems); /* * Obtains the effective limit on the number of items in a zone * * Arguments: * zone The zone to obtain the effective limit from * * Return: * 0 No limit * int The effective limit of the zone */ int uma_zone_get_max(uma_zone_t zone); /* * Sets a warning to be printed when limit is reached * * Arguments: * zone The zone we will warn about * warning Warning content * * Returns: * Nothing */ void uma_zone_set_warning(uma_zone_t zone, const char *warning); /* * Sets a function to run when limit is reached * * Arguments: * zone The zone to which this applies * fx The function ro run * * Returns: * Nothing */ typedef void (*uma_maxaction_t)(uma_zone_t, int); void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t); /* * Obtains the approximate current number of items allocated from a zone * * Arguments: * zone The zone to obtain the current allocation count from * * Return: * int The approximate current number of items allocated from the zone */ int uma_zone_get_cur(uma_zone_t zone); /* * The following two routines (uma_zone_set_init/fini) * are used to set the backend init/fini pair which acts on an * object as it becomes allocated and is placed in a slab within * the specified zone's backing keg. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit); void uma_zone_set_fini(uma_zone_t zone, uma_fini fini); /* * The following two routines (uma_zone_set_zinit/zfini) are * used to set the zinit/zfini pair which acts on an object as * it passes from the backing Keg's slab cache to the * specified Zone's bucket cache. These should probably not * be changed once allocations have already begun, but only be set * immediately upon zone creation. */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit); void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini); /* * Replaces the standard backend allocator for this zone. * * Arguments: * zone The zone whose backend allocator is being changed. * allocf A pointer to the allocation function * * Returns: * Nothing * * Discussion: * This could be used to implement pageable allocation, or perhaps * even DMA allocators if used in conjunction with the OFFPAGE * zone flag. */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf); /* * Used for freeing memory provided by the allocf above * * Arguments: * zone The zone that intends to use this free routine. * freef The page freeing routine. * * Returns: * Nothing */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef); /* * Associate a zone with a smr context that is allocated after creation * so that multiple zones may share the same context. */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr); /* * Fetch the smr context that was set or made in uma_zcreate(). */ smr_t uma_zone_get_smr(uma_zone_t zone); /* * These flags are settable in the allocf and visible in the freef. */ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ #define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kmem */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ /* 0x02, 0x10, 0x40, and 0x80 are available */ /* * Used to pre-fill a zone with some number of items * * Arguments: * zone The zone to fill * itemcnt The number of items to reserve * * Returns: * Nothing * * NOTE: This is blocking and should only be done at startup */ void uma_prealloc(uma_zone_t zone, int itemcnt); /* * Used to determine if a fixed-size zone is exhausted. * * Arguments: * zone The zone to check * * Returns: * Non-zero if zone is exhausted. */ int uma_zone_exhausted(uma_zone_t zone); /* * Returns the bytes of memory consumed by the zone. */ size_t uma_zone_memory(uma_zone_t zone); /* * Common UMA_ZONE_PCPU zones. */ extern uma_zone_t pcpu_zone_4; extern uma_zone_t pcpu_zone_8; extern uma_zone_t pcpu_zone_16; extern uma_zone_t pcpu_zone_32; extern uma_zone_t pcpu_zone_64; /* * Exported statistics structures to be used by user space monitoring tools. * Statistics stream consists of a uma_stream_header, followed by a series of * alternative uma_type_header and uma_type_stat structures. */ #define UMA_STREAM_VERSION 0x00000001 struct uma_stream_header { uint32_t ush_version; /* Stream format version. */ uint32_t ush_maxcpus; /* Value of MAXCPU for stream. */ uint32_t ush_count; /* Number of records. */ uint32_t _ush_pad; /* Pad/reserved field. */ }; #define UTH_MAX_NAME 32 #define UTH_ZONE_SECONDARY 0x00000001 struct uma_type_header { /* * Static per-zone data, some extracted from the supporting keg. */ char uth_name[UTH_MAX_NAME]; uint32_t uth_align; /* Keg: alignment. */ uint32_t uth_size; /* Keg: requested size of item. */ uint32_t uth_rsize; /* Keg: real size of item. */ uint32_t uth_maxpages; /* Keg: maximum number of pages. */ uint32_t uth_limit; /* Keg: max items to allocate. */ /* * Current dynamic zone/keg-derived statistics. */ uint32_t uth_pages; /* Keg: pages allocated. */ uint32_t uth_keg_free; /* Keg: items free. */ uint32_t uth_zone_free; /* Zone: items free. */ uint32_t uth_bucketsize; /* Zone: desired bucket size. */ uint32_t uth_zone_flags; /* Zone: flags. */ uint64_t uth_allocs; /* Zone: number of allocations. */ uint64_t uth_frees; /* Zone: number of frees. */ uint64_t uth_fails; /* Zone: number of alloc failures. */ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */ uint64_t uth_xdomain; /* Zone: Number of cross domain frees. */ uint64_t _uth_reserved1[1]; /* Reserved. */ }; struct uma_percpu_stat { uint64_t ups_allocs; /* Cache: number of allocations. */ uint64_t ups_frees; /* Cache: number of frees. */ uint64_t ups_cache_free; /* Cache: free items in cache. */ uint64_t _ups_reserved[5]; /* Reserved. */ }; void uma_reclaim_wakeup(void); void uma_reclaim_worker(void *); unsigned long uma_limit(void); /* Return the amount of memory managed by UMA. */ unsigned long uma_size(void); /* Return the amount of memory remaining. May be negative. */ long uma_avail(void); #endif /* _VM_UMA_H_ */ diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c index e93c561d759a..4de850afcb66 100644 --- a/sys/vm/uma_core.c +++ b/sys/vm/uma_core.c @@ -1,5983 +1,5988 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2019 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #include #ifdef INVARIANTS #define UMA_ALWAYS_CTORDTOR 1 #else #define UMA_ALWAYS_CTORDTOR 0 #endif /* * This is the zone and keg from which all zones are spawned. */ static uma_zone_t kegs; static uma_zone_t zones; /* * On INVARIANTS builds, the slab contains a second bitset of the same size, * "dbg_bits", which is laid out immediately after us_free. */ #ifdef INVARIANTS #define SLAB_BITSETS 2 #else #define SLAB_BITSETS 1 #endif /* * These are the two zones from which all offpage uma_slab_ts are allocated. * * One zone is for slab headers that can represent a larger number of items, * making the slabs themselves more efficient, and the other zone is for * headers that are smaller and represent fewer items, making the headers more * efficient. */ #define SLABZONE_SIZE(setsize) \ (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS) #define SLABZONE0_SETSIZE (PAGE_SIZE / 16) #define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE #define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE) #define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE) static uma_zone_t slabzones[2]; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ static unsigned int uma_cache_align_mask = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* * Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of * zones. */ static struct rwlock_padalign __exclusive_cache_line uma_rwlock; static struct sx uma_reclaim_lock; /* * First available virual address for boot time allocations. */ static vm_offset_t bootstart; static vm_offset_t bootmem; /* * kmem soft limit, initialized by uma_set_limit(). Ensure that early * allocations don't trigger a wakeup of the reclaim thread. */ unsigned long uma_kmem_limit = LONG_MAX; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, "UMA kernel memory soft limit"); unsigned long uma_kmem_total; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, "UMA kernel memory usage"); /* Is the VM done starting up? */ static enum { BOOT_COLD, BOOT_KVA, BOOT_PCPU, BOOT_RUNNING, BOOT_SHUTDOWN, } booted = BOOT_COLD; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct timeout_task uma_timeout_task; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; const char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) struct uma_bucket_zone bucket_zones[] = { /* Literal bucket sizes. */ { NULL, "2 Bucket", 2, 4096 }, { NULL, "4 Bucket", 4, 3072 }, { NULL, "8 Bucket", 8, 2048 }, { NULL, "16 Bucket", 16, 1024 }, /* Rounded down power of 2 sizes for efficiency. */ { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_CNT = 0x00000001, SKIP_DTOR = 0x00010000, SKIP_FINI = 0x00020000, }; /* Prototypes.. */ void uma_startup1(vm_offset_t); void uma_startup2(void); static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_reclaim(uma_zone_t zone, bool, int); static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static void keg_drain(uma_keg_t keg, int domain); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip); static int zero_init(void *, int, int); static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws); static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *); static void zone_timeout(uma_zone_t zone, void *); static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *, int); static void uma_shutdown(void); static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static int zone_alloc_limit(uma_zone_t zone, int count, int flags); static void zone_free_limit(uma_zone_t zone, int count); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(int domain); static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); static size_t slab_sizeof(int nitems); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(void *, void **, int, int, int); static void zone_release(void *, void **, int); static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); static bool cache_free(uma_zone_t, uma_cache_t, void *, int); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); static uint64_t uma_zone_get_allocs(uma_zone_t zone); static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Memory allocation debugging"); #ifdef INVARIANTS static uint64_t uma_keg_get_allocs(uma_keg_t zone); static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); static bool uma_dbg_kskip(uma_keg_t keg, void *mem); static bool uma_dbg_zskip(uma_zone_t zone, void *mem); static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); static u_int dbg_divisor = 1; SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, "Debug & thrash every this item in memory allocator"); static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; static counter_u64_t uma_skip_cnt = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD, &uma_dbg_cnt, "memory items debugged"); SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD, &uma_skip_cnt, "memory items skipped, not debugged"); #endif SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Universal Memory Allocator"); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); static int multipage_slabs = 1; TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs); SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0, "UMA may choose larger slab sizes for better efficiency"); /* * Select the slab zone for an offpage slab with the given maximum item count. */ static inline uma_zone_t slabzone(int ipers) { return (slabzones[ipers > SLABZONE0_SETSIZE]); } /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { KASSERT(booted >= BOOT_KVA, ("Bucket enable before init")); bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_FIRSTTOUCH); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * Don't allocate buckets early in boot. */ if (__predict_false(booted < BOOT_KVA)) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if (((uintptr_t)udata & UMA_ZONE_VM) != 0) flags |= M_NOVM; ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size)); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = min(ubz->ubz_entries, zone->uz_bucket_size_max); bucket->ub_seq = SMR_SEQ_INVALID; CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p", zone->uz_name, zone, bucket); } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; if (bucket->ub_cnt != 0) bucket_drain(zone, bucket); KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); KASSERT(bucket->ub_seq == SMR_SEQ_INVALID, ("bucket_free: Freeing an SMR bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(int domain) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN, domain); } #ifdef KASAN _Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0, "Base UMA allocation size not a multiple of the KASAN scale factor"); static void kasan_mark_item_valid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz, rsz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = zone->uz_size; rsz = roundup2(sz, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kasan_mark(item, sz, rsz, KASAN_GENERIC_REDZONE); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), sz, rsz, KASAN_GENERIC_REDZONE); } } static void kasan_mark_item_invalid(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) return; sz = roundup2(zone->uz_size, KASAN_SHADOW_SCALE); if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kasan_mark(item, 0, sz, KASAN_UMA_FREED); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) kasan_mark(zpcpu_get_cpu(pcpu_item, i), 0, sz, KASAN_UMA_FREED); } } static void kasan_mark_slab_valid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { sz = keg->uk_ppera * PAGE_SIZE; kasan_mark(mem, sz, sz, 0); } } static void kasan_mark_slab_invalid(uma_keg_t keg, void *mem) { size_t sz; if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) sz = keg->uk_ppera * PAGE_SIZE; else sz = keg->uk_pgoff; kasan_mark(mem, 0, sz, KASAN_UMA_FREED); } } #else /* !KASAN */ static void kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused) { } static void kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused) { } static void kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused) { } #endif /* KASAN */ #ifdef KMSAN static inline void kmsan_mark_item_uninitialized(uma_zone_t zone, void *item) { void *pcpu_item; size_t sz; int i; if ((zone->uz_flags & (UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0) { /* * Cache zones should not be instrumented by default, as UMA * does not have enough information to do so correctly. * Consumers can mark items themselves if it makes sense to do * so. * * Items from secondary zones are initialized by the parent * zone and thus cannot safely be marked by UMA. * * malloc zones are handled directly by malloc(9) and friends, * since they can provide more precise origin tracking. */ return; } if (zone->uz_keg->uk_init != NULL) { /* * By definition, initialized items cannot be marked. The * best we can do is mark items from these zones after they * are freed to the keg. */ return; } sz = zone->uz_size; if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { kmsan_orig(item, sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(item, sz, KMSAN_STATE_UNINIT); } else { pcpu_item = zpcpu_base_to_offset(item); for (i = 0; i <= mp_maxid; i++) { kmsan_orig(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); kmsan_mark(zpcpu_get_cpu(pcpu_item, i), sz, KMSAN_STATE_INITED); } } } #else /* !KMSAN */ static inline void kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused) { } #endif /* KMSAN */ /* * Acquire the domain lock and record contention. */ static uma_zone_domain_t zone_domain_lock(uma_zone_t zone, int domain) { uma_zone_domain_t zdom; bool lockfail; zdom = ZDOM_GET(zone, domain); lockfail = false; if (ZDOM_OWNED(zdom)) lockfail = true; ZDOM_LOCK(zdom); /* This is unsynchronized. The counter does not need to be precise. */ if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) zone->uz_bucket_size++; return (zdom); } /* * Search for the domain with the least cached items and return it if it * is out of balance with the preferred domain. */ static __noinline int zone_domain_lowest(uma_zone_t zone, int pref) { long least, nitems, prefitems; int domain; int i; prefitems = least = LONG_MAX; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems < least) { domain = i; least = nitems; } if (domain == pref) prefitems = nitems; } if (prefitems < least * 2) return (pref); return (domain); } /* * Search for the domain with the most cached items and return it or the * preferred domain if it has enough to proceed. */ static __noinline int zone_domain_highest(uma_zone_t zone, int pref) { long most, nitems; int domain; int i; if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX) return (pref); most = 0; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems > most) { domain = i; most = nitems; } } return (domain); } /* * Set the maximum imax value. */ static void zone_domain_imax_set(uma_zone_domain_t zdom, int nitems) { long old; old = zdom->uzd_imax; do { if (old >= nitems) return; } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0); /* * We are at new maximum, so do the last WSS update for the old * bimin and prepare to measure next allocation batch. */ if (zdom->uzd_wss < old - zdom->uzd_bimin) zdom->uzd_wss = old - zdom->uzd_bimin; zdom->uzd_bimin = nitems; } /* * Attempt to satisfy an allocation by retrieving a full bucket from one of the * zone's caches. If a bucket is found the zone is not locked on return. */ static uma_bucket_t zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim) { uma_bucket_t bucket; long cnt; int i; bool dtor = false; ZDOM_LOCK_ASSERT(zdom); if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL) return (NULL); /* SMR Buckets can not be re-used until readers expire. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { if (!smr_poll(zone->uz_smr, bucket->ub_seq, false)) return (NULL); bucket->ub_seq = SMR_SEQ_INVALID; dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR; if (STAILQ_NEXT(bucket, ub_link) != NULL) zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq; } STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link); KASSERT(zdom->uzd_nitems >= bucket->ub_cnt, ("%s: item count underflow (%ld, %d)", __func__, zdom->uzd_nitems, bucket->ub_cnt)); KASSERT(bucket->ub_cnt > 0, ("%s: empty bucket in bucket cache", __func__)); zdom->uzd_nitems -= bucket->ub_cnt; if (reclaim) { /* * Shift the bounds of the current WSS interval to avoid * perturbing the estimates. */ cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt); atomic_subtract_long(&zdom->uzd_imax, cnt); zdom->uzd_bimin -= cnt; zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt); if (zdom->uzd_limin >= bucket->ub_cnt) { zdom->uzd_limin -= bucket->ub_cnt; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } } else if (zdom->uzd_bimin > zdom->uzd_nitems) { zdom->uzd_bimin = zdom->uzd_nitems; if (zdom->uzd_imin > zdom->uzd_nitems) zdom->uzd_imin = zdom->uzd_nitems; } ZDOM_UNLOCK(zdom); if (dtor) for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); return (bucket); } /* * Insert a full bucket into the specified cache. The "ws" parameter indicates * whether the bucket's contents should be counted as part of the zone's working * set. The bucket may be freed if it exceeds the bucket limit. */ static void zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata, const bool ws) { uma_zone_domain_t zdom; /* We don't cache empty buckets. This can happen after a reclaim. */ if (bucket->ub_cnt == 0) goto out; zdom = zone_domain_lock(zone, domain); /* * Conditionally set the maximum number of items. */ zdom->uzd_nitems += bucket->ub_cnt; if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) { if (ws) { zone_domain_imax_set(zdom, zdom->uzd_nitems); } else { /* * Shift the bounds of the current WSS interval to * avoid perturbing the estimates. */ atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt); zdom->uzd_imin += bucket->ub_cnt; zdom->uzd_bimin += bucket->ub_cnt; zdom->uzd_limin += bucket->ub_cnt; } if (STAILQ_EMPTY(&zdom->uzd_buckets)) zdom->uzd_seq = bucket->ub_seq; /* * Try to promote reuse of recently used items. For items * protected by SMR, try to defer reuse to minimize polling. */ if (bucket->ub_seq == SMR_SEQ_INVALID) STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); ZDOM_UNLOCK(zdom); return; } zdom->uzd_nitems -= bucket->ub_cnt; ZDOM_UNLOCK(zdom); out: bucket_free(zone, bucket, udata); } /* Pops an item out of a per-cpu cache bucket. */ static inline void * cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) { void *item; CRITICAL_ASSERT(curthread); bucket->ucb_cnt--; item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; #ifdef INVARIANTS bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); #endif cache->uc_allocs++; return (item); } /* Pushes an item into a per-cpu cache bucket. */ static inline void cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) { CRITICAL_ASSERT(curthread); KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; bucket->ucb_cnt++; cache->uc_frees++; } /* * Unload a UMA bucket from a per-cpu cache. */ static inline uma_bucket_t cache_bucket_unload(uma_cache_bucket_t bucket) { uma_bucket_t b; b = bucket->ucb_bucket; if (b != NULL) { MPASS(b->ub_entries == bucket->ucb_entries); b->ub_cnt = bucket->ucb_cnt; bucket->ucb_bucket = NULL; bucket->ucb_entries = bucket->ucb_cnt = 0; } return (b); } static inline uma_bucket_t cache_bucket_unload_alloc(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_allocbucket)); } static inline uma_bucket_t cache_bucket_unload_free(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_freebucket)); } static inline uma_bucket_t cache_bucket_unload_cross(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_crossbucket)); } /* * Load a bucket into a per-cpu cache bucket. */ static inline void cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) { CRITICAL_ASSERT(curthread); MPASS(bucket->ucb_bucket == NULL); MPASS(b->ub_seq == SMR_SEQ_INVALID); bucket->ucb_bucket = b; bucket->ucb_cnt = b->ub_cnt; bucket->ucb_entries = b->ub_entries; } static inline void cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_allocbucket, b); } static inline void cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_freebucket, b); } #ifdef NUMA static inline void cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_crossbucket, b); } #endif /* * Copy and preserve ucb_spare. */ static inline void cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { b1->ucb_bucket = b2->ucb_bucket; b1->ucb_entries = b2->ucb_entries; b1->ucb_cnt = b2->ucb_cnt; } /* * Swap two cache buckets. */ static inline void cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { struct uma_cache_bucket b3; CRITICAL_ASSERT(curthread); cache_bucket_copy(&b3, b1); cache_bucket_copy(b1, b2); cache_bucket_copy(b2, &b3); } /* * Attempt to fetch a bucket from a zone on behalf of the current cpu cache. */ static uma_bucket_t cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; smr_seq_t seq; /* * Avoid the lock if possible. */ zdom = ZDOM_GET(zone, domain); if (zdom->uzd_nitems == 0) return (NULL); if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 && (seq = atomic_load_32(&zdom->uzd_seq)) != SMR_SEQ_INVALID && !smr_poll(zone->uz_smr, seq, false)) return (NULL); /* * Check the zone's cache of buckets. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) return (bucket); ZDOM_UNLOCK(zdom); return (NULL); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static inline void zone_maxaction(uma_zone_t zone) { if (zone->uz_maxaction.ta_func != NULL) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *context __unused, int pending __unused) { bucket_enable(); zone_foreach(zone_timeout, NULL); /* Reschedule this event */ taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, UMA_TIMEOUT * hz); } /* * Update the working set size estimates for the zone's bucket cache. * The constants chosen here are somewhat arbitrary. */ static void zone_domain_update_wss(uma_zone_domain_t zdom) { long m; ZDOM_LOCK_ASSERT(zdom); MPASS(zdom->uzd_imax >= zdom->uzd_nitems); MPASS(zdom->uzd_nitems >= zdom->uzd_bimin); MPASS(zdom->uzd_bimin >= zdom->uzd_imin); /* * Estimate WSS as modified moving average of biggest allocation * batches for each period over few minutes (UMA_TIMEOUT of 20s). */ zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4, zdom->uzd_imax - zdom->uzd_bimin); /* * Estimate longtime minimum item count as a combination of recent * minimum item count, adjusted by WSS for safety, and the modified * moving average over the last several hours (UMA_TIMEOUT of 20s). * timin measures time since limin tried to go negative, that means * we were dangerously close to or got out of cache. */ m = zdom->uzd_imin - zdom->uzd_wss; if (m >= 0) { if (zdom->uzd_limin >= m) zdom->uzd_limin = m; else zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256; zdom->uzd_timin++; } else { zdom->uzd_limin = 0; zdom->uzd_timin = 0; } /* To reduce period edge effects on WSS keep half of the imax. */ atomic_subtract_long(&zdom->uzd_imax, (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2); zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems; } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void zone_timeout(uma_zone_t zone, void *unused) { uma_keg_t keg; u_int slabs, pages; if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) goto trim; keg = zone->uz_keg; /* * Hash zones are non-numa by definition so the first domain * is the only one present. */ KEG_LOCK(keg, 0); pages = keg->uk_domain[0].ud_pages; /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ KEG_UNLOCK(keg, 0); ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg, 0); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg, 0); hash_free(&oldhash); goto trim; } } KEG_UNLOCK(keg, 0); trim: /* Trim caches not used for a long time. */ - if ((zone->uz_flags & UMA_ZONE_UNMANAGED) == 0) { + if ((zone->uz_flags & (UMA_ZONE_UNMANAGED | UMA_ZONE_NOTRIM)) == 0) { for (int i = 0; i < vm_ndomains; i++) { if (bucket_cache_reclaim_domain(zone, false, false, i) && (zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, i); } } } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on success and 0 on failure. */ static int hash_alloc(struct uma_hash *hash, u_int size) { size_t alloc; KASSERT(powerof2(size), ("hash size must be power of 2")); if (size > UMA_HASH_SIZE_INIT) { hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_hash_slab_t slab; u_int hval; u_int idx; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (idx = 0; idx < oldhash->uh_hashsize; idx++) while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); LIST_REMOVE(slab, uhs_hlink); hval = UMA_HASH(newhash, slab->uhs_data); LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, uhs_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket->ub_cnt == 0) return; if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { smr_wait(zone->uz_smr, bucket->ub_seq); bucket->ub_seq = SMR_SEQ_INVALID; for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); } if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); } zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); if (zone->uz_max_items > 0) zone_free_limit(zone, bucket->ub_cnt); #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt); #endif bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being torn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t bucket; smr_seq_t seq; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket = cache_bucket_unload_alloc(cache); if (bucket != NULL) bucket_free(zone, bucket, NULL); bucket = cache_bucket_unload_free(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } bucket = cache_bucket_unload_cross(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } } bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN); } static void cache_shrink(uma_zone_t zone, void *unused) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; ZONE_LOCK(zone); zone->uz_bucket_size = (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; ZONE_UNLOCK(zone); } static void cache_drain_safe_cpu(uma_zone_t zone, void *unused) { uma_cache_t cache; uma_bucket_t b1, b2, b3; int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = b3 = NULL; critical_enter(); cache = &zone->uz_cpu[curcpu]; domain = PCPU_GET(domain); b1 = cache_bucket_unload_alloc(cache); /* * Don't flush SMR zone buckets. This leaves the zone without a * bucket and forces every free to synchronize(). */ if ((zone->uz_flags & UMA_ZONE_SMR) == 0) { b2 = cache_bucket_unload_free(cache); b3 = cache_bucket_unload_cross(cache); } critical_exit(); if (b1 != NULL) zone_free_bucket(zone, b1, NULL, domain, false); if (b2 != NULL) zone_free_bucket(zone, b2, NULL, domain, false); if (b3 != NULL) { /* Adjust the domain so it goes to zone_free_cross. */ domain = (domain + 1) % vm_ndomains; zone_free_bucket(zone, b3, NULL, domain, false); } } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. */ static void pcpu_cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enough, shrink aggressively. */ if (zone) cache_shrink(zone, NULL); else zone_foreach(cache_shrink, NULL); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone, NULL); else zone_foreach(cache_drain_safe_cpu, NULL); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller * requested a drain, otherwise the per-domain caches are trimmed to either * estimated working set size. */ static bool bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; long target; bool done = false; /* * The cross bucket is partially filled and not part of * the item count. Reclaim it individually here. */ zdom = ZDOM_GET(zone, domain); if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) { ZONE_CROSS_LOCK(zone); bucket = zdom->uzd_cross; zdom->uzd_cross = NULL; ZONE_CROSS_UNLOCK(zone); if (bucket != NULL) bucket_free(zone, bucket, NULL); } /* * If we were asked to drain the zone, we are done only once * this bucket cache is empty. If trim, we reclaim items in * excess of the zone's estimated working set size. Multiple * consecutive calls will shrink the WSS and so reclaim more. * If neither drain nor trim, then voluntarily reclaim 1/4 * (to reduce first spike) of items not used for a long time. */ ZDOM_LOCK(zdom); zone_domain_update_wss(zdom); if (drain) target = 0; else if (trim) target = zdom->uzd_wss; else if (zdom->uzd_timin > 900 / UMA_TIMEOUT) target = zdom->uzd_nitems - zdom->uzd_limin / 4; else { ZDOM_UNLOCK(zdom); return (done); } while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL && zdom->uzd_nitems >= target + bucket->ub_cnt) { bucket = zone_fetch_bucket(zone, zdom, true); if (bucket == NULL) break; bucket_free(zone, bucket, NULL); done = true; ZDOM_LOCK(zdom); } ZDOM_UNLOCK(zdom); return (done); } static void bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain) { int i; /* * Shrink the zone bucket size to ensure that the per-CPU caches * don't grow too large. */ if (zone->uz_bucket_size > zone->uz_bucket_size_min) zone->uz_bucket_size--; if (domain != UMA_ANYDOMAIN && (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) { bucket_cache_reclaim_domain(zone, drain, true, domain); } else { for (i = 0; i < vm_ndomains; i++) bucket_cache_reclaim_domain(zone, drain, true, i); } } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; size_t size; int i; uint8_t flags; CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); mem = slab_data(slab, keg); size = PAGE_SIZE * keg->uk_ppera; kasan_mark_slab_valid(keg, mem); if (keg->uk_fini != NULL) { for (i = start - 1; i > -1; i--) #ifdef INVARIANTS /* * trash_fini implies that dtor was trash_dtor. trash_fini * would check that memory hasn't been modified since free, * which executed trash_dtor. * That's why we need to run uma_dbg_kskip() check here, * albeit we don't make skip check for other init/fini * invocations. */ if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || keg->uk_fini != trash_fini) #endif keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); } flags = slab->us_flags; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); } keg->uk_freef(mem, size, flags); uma_total_dec(size); } static void keg_drain_domain(uma_keg_t keg, int domain) { struct slabhead freeslabs; uma_domain_t dom; uma_slab_t slab, tmp; uint32_t i, stofree, stokeep, partial; dom = &keg->uk_domain[domain]; LIST_INIT(&freeslabs); CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", keg->uk_name, keg, domain, dom->ud_free_items); KEG_LOCK(keg, domain); /* * Are the free items in partially allocated slabs sufficient to meet * the reserve? If not, compute the number of fully free slabs that must * be kept. */ partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers; if (partial < keg->uk_reserve) { stokeep = min(dom->ud_free_slabs, howmany(keg->uk_reserve - partial, keg->uk_ipers)); } else { stokeep = 0; } stofree = dom->ud_free_slabs - stokeep; /* * Partition the free slabs into two sets: those that must be kept in * order to maintain the reserve, and those that may be released back to * the system. Since one set may be much larger than the other, * populate the smaller of the two sets and swap them if necessary. */ for (i = min(stofree, stokeep); i > 0; i--) { slab = LIST_FIRST(&dom->ud_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&freeslabs, slab, us_link); } if (stofree > stokeep) LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link); if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) { LIST_FOREACH(slab, &freeslabs, us_link) UMA_HASH_REMOVE(&keg->uk_hash, slab); } dom->ud_free_items -= stofree * keg->uk_ipers; dom->ud_free_slabs -= stofree; dom->ud_pages -= stofree * keg->uk_ppera; KEG_UNLOCK(keg, domain); LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp) keg_free_slab(keg, slab, keg->uk_ipers); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg, int domain) { int i; if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0) return; if (domain != UMA_ANYDOMAIN) { keg_drain_domain(keg, domain); } else { for (i = 0; i < vm_ndomains; i++) keg_drain_domain(keg, i); } } static void zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain) { /* * Count active reclaim operations in order to interlock with * zone_dtor(), which removes the zone from global lists before * attempting to reclaim items itself. * * The zone may be destroyed while sleeping, so only zone_dtor() should * specify M_WAITOK. */ ZONE_LOCK(zone); if (waitok == M_WAITOK) { while (zone->uz_reclaimers > 0) msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1); } zone->uz_reclaimers++; ZONE_UNLOCK(zone); bucket_cache_reclaim(zone, drain, domain); if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg, domain); ZONE_LOCK(zone); zone->uz_reclaimers--; if (zone->uz_reclaimers == 0) wakeup(zone); ZONE_UNLOCK(zone); } /* * Allocate a new slab for a keg and inserts it into the partial slab list. * The keg should be unlocked on entry. If the allocation succeeds it will * be locked on return. * * Arguments: * flags Wait flags for the item initialization routine * aflags Wait flags for the slab allocation * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, int aflags) { uma_domain_t dom; uma_slab_t slab; unsigned long size; uint8_t *mem; uint8_t sflags; int i; TSENTER(); KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_alloc_slab: domain %d out of range", domain)); slab = NULL; mem = NULL; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { uma_hash_slab_t hslab; hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, domain, aflags); if (hslab == NULL) goto fail; slab = &hslab->uhs_slab; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) aflags |= M_ZERO; else aflags &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) aflags |= M_NODUMP; if (keg->uk_flags & UMA_ZONE_NOFREE) aflags |= M_NEVERFREED; /* zone is passed for legacy reasons. */ size = keg->uk_ppera * PAGE_SIZE; mem = keg->uk_allocf(zone, size, domain, &sflags, aflags); if (mem == NULL) { if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); goto fail; } uma_total_inc(size); /* For HASH zones all pages go to the same uma_domain. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; kmsan_mark(mem, size, (aflags & M_ZERO) != 0 ? KMSAN_STATE_INITED : KMSAN_STATE_UNINIT); /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) slab = (uma_slab_t)(mem + keg->uk_pgoff); else slab_tohashslab(slab)->uhs_data = mem; if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), zone, slab); slab->us_freecount = keg->uk_ipers; slab->us_flags = sflags; slab->us_domain = domain; BIT_FILL(keg->uk_ipers, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); #endif if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab_item(slab, keg, i), keg->uk_size, flags) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); goto fail; } } kasan_mark_slab_invalid(keg, mem); KEG_LOCK(keg, domain); CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", slab, keg->uk_name, keg); if (keg->uk_flags & UMA_ZFLAG_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ dom = &keg->uk_domain[domain]; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); dom->ud_pages += keg->uk_ppera; dom->ud_free_items += keg->uk_ipers; TSEXIT(); return (slab); fail: return (NULL); } /* * This function is intended to be used early on in place of page_alloc(). It * performs contiguous physical memory allocations and uses a bump allocator for * KVA, so is usable before the kernel map is initialized. */ static void * startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vm_paddr_t pa; vm_page_t m; int i, pages; pages = howmany(bytes, PAGE_SIZE); KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); *pflag = UMA_SLAB_BOOT; m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); for (i = 0; i < pages; i++, pa += PAGE_SIZE) { #if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif } /* Allocate KVA and indirectly advance bootmem. */ return ((void *)pmap_map(&bootmem, m->phys_addr, m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE)); } static void startup_free(void *mem, vm_size_t bytes) { vm_offset_t va; vm_page_t m; va = (vm_offset_t)mem; m = PHYS_TO_VM_PAGE(pmap_kextract(va)); /* * startup_alloc() returns direct-mapped slabs on some platforms. Avoid * unmapping ranges of the direct map. */ if (va >= bootstart && va + bytes <= bootmem) pmap_remove(kernel_pmap, va, va + bytes); for (; bytes != 0; bytes -= PAGE_SIZE, m++) { #if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; p = kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); return (p); } static void * pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { struct pglist alloctail; vm_offset_t addr, zkva; int cpu, flags; vm_page_t p, p_next; #ifdef NUMA struct pcpu *pc; #endif MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); TAILQ_INIT(&alloctail); flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | malloc2vm_flags(wait); *pflag = UMA_SLAB_KERNEL; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) { p = vm_page_alloc_noobj(flags); } else { #ifndef NUMA p = vm_page_alloc_noobj(flags); #else pc = pcpu_find(cpu); if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) p = NULL; else p = vm_page_alloc_noobj_domain(pc->pc_domain, flags); if (__predict_false(p == NULL)) p = vm_page_alloc_noobj(flags); #endif } if (__predict_false(p == NULL)) goto fail; TAILQ_INSERT_TAIL(&alloctail, p, listq); } if ((addr = kva_alloc(bytes)) == 0) goto fail; zkva = addr; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void*)addr); fail: TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } /* * Allocates a number of pages not belonging to a VM object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; int req; TAILQ_INIT(&alloctail); keg = zone->uz_keg; req = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED; if ((wait & M_WAITOK) != 0) req |= VM_ALLOC_WAITOK; npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { p = vm_page_alloc_noobj_domain(domain, req); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Allocate physically contiguous pages. */ static void * contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } #if defined(UMA_USE_DMAP) && !defined(UMA_MD_SMALL_ALLOC) void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); return (va); } #endif /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, vm_size_t size, uint8_t flags) { if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } KASSERT((flags & UMA_SLAB_KERNEL) != 0, ("UMA: page_free used with invalid flags %x", flags)); kmem_free(mem, size); } /* * Frees pcpu zone allocations * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) { vm_offset_t sva, curva; vm_paddr_t paddr; vm_page_t m; MPASS(size == (mp_maxid+1)*PAGE_SIZE); if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } sva = (vm_offset_t)mem; for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { paddr = pmap_kextract(curva); m = PHYS_TO_VM_PAGE(paddr); vm_page_unwire_noq(m); vm_page_free(m); } pmap_qremove(sva, size >> PAGE_SHIFT); kva_free(sva, size); } #if defined(UMA_USE_DMAP) && !defined(UMA_MD_SMALL_ALLOC) void uma_small_free(void *mem, vm_size_t size, uint8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } #endif /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } #ifdef INVARIANTS static struct noslabbits * slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) { return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); } #endif /* * Actual size of embedded struct slab (!OFFPAGE). */ static size_t slab_sizeof(int nitems) { size_t s; s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; return (roundup(s, UMA_ALIGN_PTR + 1)); } #define UMA_FIXPT_SHIFT 31 #define UMA_FRAC_FIXPT(n, d) \ ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) #define UMA_FIXPT_PCT(f) \ ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) /* * Compute the number of items that will fit in a slab. If hdr is true, the * item count may be limited to provide space in the slab for an inline slab * header. Otherwise, all slab space will be provided for item storage. */ static u_int slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) { u_int ipers; u_int padpi; /* The padding between items is not needed after the last item. */ padpi = rsize - size; if (hdr) { /* * Start with the maximum item count and remove items until * the slab header first alongside the allocatable memory. */ for (ipers = MIN(SLAB_MAX_SETSIZE, (slabsize + padpi - slab_sizeof(1)) / rsize); ipers > 0 && ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; ipers--) continue; } else { ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); } return (ipers); } struct keg_layout_result { u_int format; u_int slabsize; u_int ipers; u_int eff; }; static void keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt, struct keg_layout_result *kl) { u_int total; kl->format = fmt; kl->slabsize = slabsize; /* Handle INTERNAL as inline with an extra page. */ if ((fmt & UMA_ZFLAG_INTERNAL) != 0) { kl->format &= ~UMA_ZFLAG_INTERNAL; kl->slabsize += PAGE_SIZE; } kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize, (fmt & UMA_ZFLAG_OFFPAGE) == 0); /* Account for memory used by an offpage slab header. */ total = kl->slabsize; if ((fmt & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(kl->ipers)->uz_keg->uk_rsize; kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total); } /* * Determine the format of a uma keg. This determines where the slab header * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_layout(uma_keg_t keg) { struct keg_layout_result kl = {}, kl_tmp; u_int fmts[2]; u_int alignsize; u_int nfmt; u_int pages; u_int rsize; u_int slabsize; u_int i, j; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", __func__, keg->uk_name, keg->uk_size, keg->uk_flags, PRINT_UMA_ZFLAGS)); KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 || (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, PRINT_UMA_ZFLAGS)); alignsize = keg->uk_align + 1; #ifdef KASAN /* * ASAN requires that each allocation be aligned to the shadow map * scale factor. */ if (alignsize < KASAN_SHADOW_SCALE) alignsize = KASAN_SHADOW_SCALE; #endif /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); rsize = roundup2(rsize, alignsize); if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { /* * We want one item to start on every align boundary in a page. * To do this we will span pages. We will also extend the item * by the size of align if it is an even multiple of align. * Otherwise, it would fall on the same boundary every time. */ if ((rsize & alignsize) == 0) rsize += alignsize; slabsize = rsize * (PAGE_SIZE / alignsize); slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); slabsize = round_page(slabsize); } else { /* * Start with a slab size of as many pages as it takes to * represent a single item. We will try to fit as many * additional items into the slab as possible. */ slabsize = round_page(keg->uk_size); } /* Build a list of all of the available formats for this keg. */ nfmt = 0; /* Evaluate an inline slab layout. */ if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) fmts[nfmt++] = 0; /* TODO: vm_page-embedded slab. */ /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not want * to do if we're UMA_ZONE_VM, which clearly forbids it. * In those cases, evaluate a pseudo-format called INTERNAL * which has an inline slab header and one extra page to * guarantee that it fits. * * Otherwise, see if using an OFFPAGE slab will improve our * efficiency. */ if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0) fmts[nfmt++] = UMA_ZFLAG_INTERNAL; else fmts[nfmt++] = UMA_ZFLAG_OFFPAGE; /* * Choose a slab size and format which satisfy the minimum efficiency. * Prefer the smallest slab size that meets the constraints. * * Start with a minimum slab size, to accommodate CACHESPREAD. Then, * for small items (up to PAGE_SIZE), the iteration increment is one * page; and for large items, the increment is one item. */ i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize); KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u", keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize, rsize, i)); for ( ; ; i++) { slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) : round_page(rsize * (i - 1) + keg->uk_size); for (j = 0; j < nfmt; j++) { /* Only if we have no viable format yet. */ if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 && kl.ipers > 0) continue; keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp); if (kl_tmp.eff <= kl.eff) continue; kl = kl_tmp; CTR6(KTR_UMA, "keg %s layout: format %#x " "(ipers %u * rsize %u) / slabsize %#x = %u%% eff", keg->uk_name, kl.format, kl.ipers, rsize, kl.slabsize, UMA_FIXPT_PCT(kl.eff)); /* Stop when we reach the minimum efficiency. */ if (kl.eff >= UMA_MIN_EFF) break; } if (kl.eff >= UMA_MIN_EFF || !multipage_slabs || slabsize >= SLAB_MAX_SETSIZE * rsize || (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0) break; } pages = atop(kl.slabsize); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) pages *= mp_maxid + 1; keg->uk_rsize = rsize; keg->uk_ipers = kl.ipers; keg->uk_ppera = pages; keg->uk_flags |= kl.format; /* * How do we find the slab header if it is offpage or if not all item * start addresses are in the same page? We could solve the latter * case with vaddr alignment, but we don't. */ if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 || (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) { if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) keg->uk_flags |= UMA_ZFLAG_HASH; else keg->uk_flags |= UMA_ZFLAG_VTOSLAB; } CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers, pages); KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, keg->uk_ipers, pages)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; int i; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_reserve = 0; keg->uk_flags = arg->flags; /* * We use a global round-robin policy by default. Zones with * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which * case the iterator is never run. */ keg->uk_dr.dr_policy = DOMAINSET_RR(); keg->uk_dr.dr_iter = 0; /* * The primary zone is passed to us at keg-creation time. */ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZFLAG_VTOSLAB; #ifndef SMP keg->uk_flags &= ~UMA_ZONE_PCPU; #endif keg_layout(keg); /* * Use a first-touch NUMA policy for kegs that pmap_extract() will * work on. Use round-robin for everything else. * * Zones may override the default by specifying either. */ #ifdef NUMA if ((keg->uk_flags & (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0) keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) keg->uk_flags |= UMA_ZONE_ROUNDROBIN; #endif /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ #ifdef UMA_USE_DMAP if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; else #endif if (booted < BOOT_KVA) keg->uk_allocf = startup_alloc; else if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; #ifdef UMA_USE_DMAP if (keg->uk_ppera == 1) keg->uk_freef = uma_small_free; else #endif if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_freef = pcpu_page_free; else keg->uk_freef = page_free; /* * Initialize keg's locks. */ for (i = 0; i < vm_ndomains; i++) KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. See slab_sizeof * definition. */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { size_t shsize; shsize = slab_sizeof(keg->uk_ipers); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, ("zone %s ipers %d rsize %d size %d slab won't fit", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); } if (keg->uk_flags & UMA_ZFLAG_HASH) hash_alloc(&keg->uk_hash, 0); CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } static void zone_kva_available(uma_zone_t zone, void *unused) { uma_keg_t keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return; KEG_GET(zone, keg); if (keg->uk_allocf == startup_alloc) { /* Switch to the real allocator. */ if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; } } static void zone_alloc_counters(uma_zone_t zone, void *unused) { zone->uz_allocs = counter_u64_alloc(M_WAITOK); zone->uz_frees = counter_u64_alloc(M_WAITOK); zone->uz_fails = counter_u64_alloc(M_WAITOK); zone->uz_xdomain = counter_u64_alloc(M_WAITOK); } static void zone_alloc_sysctl(uma_zone_t zone, void *unused) { uma_zone_domain_t zdom; uma_domain_t dom; uma_keg_t keg; struct sysctl_oid *oid, *domainoid; int domains, i, cnt; static const char *nokeg = "cache zone"; char *c; /* * Make a sysctl safe copy of the zone name by removing * any special characters and handling dups by appending * an index. */ if (zone->uz_namecnt != 0) { /* Count the number of decimal digits and '_' separator. */ for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) cnt /= 10; zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, M_UMA, M_WAITOK); sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, zone->uz_namecnt); } else zone->uz_ctlname = strdup(zone->uz_name, M_UMA); for (c = zone->uz_ctlname; *c != '\0'; c++) if (strchr("./\\ -", *c) != NULL) *c = '_'; /* * Basic parameters at the root. */ zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); oid = zone->uz_oid; SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_flags, "A", "Allocator configuration flags"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, "Desired per-cpu cache size"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, "Maximum allowed per-cpu cache size"); /* * keg if present. */ if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) domains = vm_ndomains; else domains = 1; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); keg = zone->uz_keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, keg->uk_name, "Keg name"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, "Real object size with alignment"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, "pages per-slab allocation"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, "items available per-slab"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "align", CTLFLAG_RD, &keg->uk_align, 0, "item alignment mask"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "reserve", CTLFLAG_RD, &keg->uk_reserve, 0, "number of reserved items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, keg, 0, sysctl_handle_uma_slab_efficiency, "I", "Slab utilization (100 - internal fragmentation %)"); domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { dom = &keg->uk_domain[i]; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "pages", CTLFLAG_RD, &dom->ud_pages, 0, "Total pages currently allocated from VM"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_items", CTLFLAG_RD, &dom->ud_free_items, 0, "Items free in the slab layer"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0, "Unused slabs"); } } else SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, nokeg, "Keg name"); /* * Information about zone limits. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_items, "QU", "Current number of allocated items if limit is set"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, "Maximum number of allocated and cached items"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, "Number of threads sleeping at limit"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, "Total zone limit sleeps"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0, "Maximum number of items in each domain's bucket cache"); /* * Per-domain zone information. */ domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { zdom = ZDOM_GET(zone, i); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "nitems", CTLFLAG_RD, &zdom->uzd_nitems, "number of items in this domain"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imax", CTLFLAG_RD, &zdom->uzd_imax, "maximum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imin", CTLFLAG_RD, &zdom->uzd_imin, "minimum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bimin", CTLFLAG_RD, &zdom->uzd_bimin, "Minimum item count in this batch"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wss", CTLFLAG_RD, &zdom->uzd_wss, "Working set size"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "limin", CTLFLAG_RD, &zdom->uzd_limin, "Long time minimum item count"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "timin", CTLFLAG_RD, &zdom->uzd_timin, 0, "Time since zero long time minimum item count"); } /* * General statistics. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, zone, 1, sysctl_handle_uma_zone_cur, "I", "Current number of allocated items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_allocs, "QU", "Total allocation calls"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_frees, "QU", "Total free calls"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "fails", CTLFLAG_RD, &zone->uz_fails, "Number of allocation failures"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "xdomain", CTLFLAG_RD, &zone->uz_xdomain, "Free calls from the wrong domain"); } struct uma_zone_count { const char *name; int count; }; static void zone_count(uma_zone_t zone, void *arg) { struct uma_zone_count *cnt; cnt = arg; /* * Some zones are rapidly created with identical names and * destroyed out of order. This can lead to gaps in the count. * Use one greater than the maximum observed for this name. */ if (strcmp(zone->uz_name, cnt->name) == 0) cnt->count = MAX(cnt->count, zone->uz_namecnt + 1); } static void zone_update_caches(uma_zone_t zone) { int i; for (i = 0; i <= mp_maxid; i++) { cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); } } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zone_count cnt; struct uma_zctor_args *arg = udata; uma_zone_domain_t zdom; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; int i; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_sleeps = 0; zone->uz_bucket_size = 0; zone->uz_bucket_size_min = 0; zone->uz_bucket_size_max = BUCKET_MAX; zone->uz_flags = (arg->flags & UMA_ZONE_SMR); zone->uz_warning = NULL; /* The domain structures follow the cpu structures. */ zone->uz_bucket_max = ULONG_MAX; timevalclear(&zone->uz_ratecheck); /* Count the number of duplicate names. */ cnt.name = arg->name; cnt.count = 0; zone_foreach(zone_count, &cnt); zone->uz_namecnt = cnt.count; ZONE_CROSS_LOCK_INIT(zone); for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(zone, i); ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS)); STAILQ_INIT(&zdom->uzd_buckets); } #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) if (arg->uminit == trash_init && arg->fini == trash_fini) zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; #elif defined(KASAN) if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0) arg->flags |= UMA_ZONE_NOKASAN; #endif /* * This is a pure cache zone, no kegs. */ if (arg->import) { KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, ("zone_ctor: Import specified for non-cache zone.")); zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; #ifdef NUMA /* * Cache zones are round-robin unless a policy is * specified because they may have incompatible * constraints. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) zone->uz_flags |= UMA_ZONE_ROUNDROBIN; #endif rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. */ zone->uz_import = zone_import; zone->uz_release = zone_release; zone->uz_arg = zone; keg = arg->keg; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = (arg->flags & ~UMA_ZONE_SMR); karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* Inherit properties from the keg. */ zone->uz_keg = keg; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); out: if (booted >= BOOT_PCPU) { zone_alloc_counters(zone, NULL); if (booted >= BOOT_RUNNING) zone_alloc_sysctl(zone, NULL); } else { zone->uz_allocs = EARLY_COUNTER; zone->uz_frees = EARLY_COUNTER; zone->uz_fails = EARLY_COUNTER; } /* Caller requests a private SMR context. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) zone->uz_smr = smr_create(zone->uz_name, 0, 0); KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), ("Invalid zone flag combination")); if (arg->flags & UMA_ZFLAG_INTERNAL) zone->uz_bucket_size_max = zone->uz_bucket_size = 0; if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) zone->uz_bucket_size = BUCKET_MAX; else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) zone->uz_bucket_size = 0; else zone->uz_bucket_size = bucket_select(zone->uz_size); zone->uz_bucket_size_min = zone->uz_bucket_size; if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) zone->uz_flags |= UMA_ZFLAG_CTORDTOR; zone_update_caches(zone); return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; uint32_t free, pages; int i; keg = (uma_keg_t)arg; free = pages = 0; for (i = 0; i < vm_ndomains; i++) { free += keg->uk_domain[i].ud_free_items; pages += keg->uk_domain[i].ud_pages; KEG_LOCK_FINI(keg, i); } if (pages != 0) printf("Freed UMA keg (%s) was not empty (%u items). " " Lost %u pages of memory.\n", keg->uk_name ? keg->uk_name : "", pages / keg->uk_ppera * keg->uk_ipers - free, pages); hash_free(&keg->uk_hash); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; int i; zone = (uma_zone_t)arg; sysctl_remove_oid(zone->uz_oid, 1, 1); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; keg->uk_reserve = 0; } zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true); /* * We only destroy kegs from non secondary/non cache zones. */ if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } counter_u64_free(zone->uz_allocs); counter_u64_free(zone->uz_frees); counter_u64_free(zone->uz_fails); counter_u64_free(zone->uz_xdomain); free(zone->uz_ctlname, M_UMA); for (i = 0; i < vm_ndomains; i++) ZDOM_LOCK_FINI(ZDOM_GET(zone, i)); ZONE_CROSS_LOCK_FINI(zone); } static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg) { uma_keg_t keg; uma_zone_t zone; LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone, arg); } LIST_FOREACH(zone, &uma_cachezones, uz_link) zfunc(zone, arg); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) { rw_rlock(&uma_rwlock); zone_foreach_unlocked(zfunc, arg); rw_runlock(&uma_rwlock); } /* * Initialize the kernel memory allocator. This is done after pages can be * allocated but before general KVA is available. */ void uma_startup1(vm_offset_t virtual_avail) { struct uma_zctor_args args; size_t ksize, zsize, size; uma_keg_t primarykeg; uintptr_t m; int domain; uint8_t pflag; bootstart = bootmem = virtual_avail; rw_init(&uma_rwlock, "UMA lock"); sx_init(&uma_reclaim_lock, "umareclaim"); ksize = sizeof(struct uma_keg) + (sizeof(struct uma_domain) * vm_ndomains); ksize = roundup(ksize, UMA_SUPER_ALIGN); zsize = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)) + (sizeof(struct uma_zone_domain) * vm_ndomains); zsize = roundup(zsize, UMA_SUPER_ALIGN); /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */ size = (zsize * 2) + ksize; for (domain = 0; domain < vm_ndomains; domain++) { m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag, M_NOWAIT | M_ZERO); if (m != 0) break; } zones = (uma_zone_t)m; m += zsize; kegs = (uma_zone_t)m; m += zsize; primarykeg = (uma_keg_t)m; /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = primarykeg; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(kegs, zsize, &args, M_WAITOK); args.name = "UMA Zones"; args.size = zsize; args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make zones for slab headers */ slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); smr_init(); } #ifndef UMA_USE_DMAP extern void vm_radix_reserve_kva(void); #endif /* * Advertise the availability of normal kva allocations and switch to * the default back-end allocator. Marks the KVA we consumed on startup * as used in the map. */ void uma_startup2(void) { if (bootstart != bootmem) { vm_map_lock(kernel_map); (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); vm_map_unlock(kernel_map); } #ifndef UMA_USE_DMAP /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); #endif booted = BOOT_KVA; zone_foreach_unlocked(zone_kva_available, NULL); bucket_enable(); } /* * Allocate counters as early as possible so that boot-time allocations are * accounted more precisely. */ static void uma_startup_pcpu(void *arg __unused) { zone_foreach_unlocked(zone_alloc_counters, NULL); booted = BOOT_PCPU; } SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL); /* * Finish our initialization steps. */ static void uma_startup3(void *arg __unused) { #ifdef INVARIANTS TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); uma_dbg_cnt = counter_u64_alloc(M_WAITOK); uma_skip_cnt = counter_u64_alloc(M_WAITOK); #endif zone_foreach_unlocked(zone_alloc_sysctl, NULL); booted = BOOT_RUNNING; EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); static void uma_startup4(void *arg __unused) { TIMEOUT_TASK_INIT(taskqueue_thread, &uma_timeout_task, 0, uma_timeout, NULL); taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, UMA_TIMEOUT * hz); } SYSINIT(uma_startup4, SI_SUB_TASKQ, SI_ORDER_ANY, uma_startup4, NULL); static void uma_shutdown(void) { booted = BOOT_SHUTDOWN; } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.zone = zone; return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } static void check_align_mask(unsigned int mask) { KASSERT(powerof2(mask + 1), ("UMA: %s: Not the mask of a power of 2 (%#x)", __func__, mask)); /* * Make sure the stored align mask doesn't have its highest bit set, * which would cause implementation-defined behavior when passing it as * the 'align' argument of uma_zcreate(). Such very large alignments do * not make sense anyway. */ KASSERT(mask <= INT_MAX, ("UMA: %s: Mask too big (%#x)", __func__, mask)); } /* Public functions */ /* See uma.h */ void uma_set_cache_align_mask(unsigned int mask) { check_align_mask(mask); uma_cache_align_mask = mask; } /* Returns the alignment mask to use to request cache alignment. */ unsigned int uma_get_cache_align_mask(void) { return (uma_cache_align_mask); } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; check_align_mask(align); /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) /* * Inject procedures which check for memory use after free if we are * allowed to scramble the memory while it is not allocated. This * requires that: UMA is actually able to access the memory, no init * or fini procedures, no dependency on the initial value of the * memory, and no (legitimate) use of the memory after free. Note, * the ctor and dtor do not need to be empty. */ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { args.uminit = trash_init; args.fini = trash_fini; } #endif args.align = align; args.flags = flags; args.keg = NULL; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; keg = primary->uz_keg; memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; sx_xlock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_xunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags | UMA_ZFLAG_CACHE; return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { /* * Large slabs are expensive to reclaim, so don't bother doing * unnecessary work if we're shutting down. */ if (booted == BOOT_SHUTDOWN && zone->uz_fini == NULL && zone->uz_release == zone_release) return; sx_xlock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_xunlock(&uma_reclaim_lock); } void uma_zwait(uma_zone_t zone) { if ((zone->uz_flags & UMA_ZONE_SMR) != 0) uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK)); else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK)); else uma_zfree(zone, uma_zalloc(zone, M_WAITOK)); } void * uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) { void *item, *pcpu_item; #ifdef SMP int i; MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); if (item == NULL) return (NULL); pcpu_item = zpcpu_base_to_offset(item); if (flags & M_ZERO) { #ifdef SMP for (i = 0; i <= mp_maxid; i++) bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); #else bzero(item, zone->uz_size); #endif } return (pcpu_item); } /* * A stub while both regular and pcpu cases are identical. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata) { void *item; #ifdef SMP MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif /* uma_zfree_pcu_*(..., NULL) does nothing, to match free(9). */ if (pcpu_item == NULL) return; item = zpcpu_offset_to_base(pcpu_item); uma_zfree_arg(zone, item, udata); } static inline void * item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags, void *item) { #ifdef INVARIANTS bool skipdbg; #endif kasan_mark_item_valid(zone, item); kmsan_mark_item_uninitialized(zone, item); #ifdef INVARIANTS skipdbg = uma_dbg_zskip(zone, item); if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_ctor != trash_ctor) trash_ctor(item, size, zone, flags); #endif /* Check flags before loading ctor pointer. */ if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) && __predict_false(zone->uz_ctor != NULL) && zone->uz_ctor(item, size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); return (NULL); } #ifdef INVARIANTS if (!skipdbg) uma_dbg_alloc(zone, NULL, item); #endif if (__predict_false(flags & M_ZERO)) return (memset(item, 0, size)); return (item); } static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS bool skipdbg; skipdbg = uma_dbg_zskip(zone, item); if (skip == SKIP_NONE && !skipdbg) { if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (__predict_true(skip < SKIP_DTOR)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, size, udata); #ifdef INVARIANTS if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_dtor != trash_dtor) trash_dtor(item, size, zone); #endif } kasan_mark_item_invalid(zone, item); } #ifdef NUMA static int item_domain(void *item) { int domain; domain = vm_phys_domain(vtophys(item)); KASSERT(domain >= 0 && domain < vm_ndomains, ("%s: unknown domain for item %p", __func__, item)); return (domain); } #endif #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS) #if defined(INVARIANTS) && (defined(DDB) || defined(STACK)) #include #endif #define UMA_ZALLOC_DEBUG static int uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags) { int error; error = 0; #ifdef WITNESS if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_debug: zone \"%s\"", zone->uz_name); } #endif #ifdef INVARIANTS KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_debug: called with M_EXEC")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_debug: called within spinlock or critical section")); KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0, ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO")); _Static_assert(M_NOWAIT != 0 && M_WAITOK != 0, "M_NOWAIT and M_WAITOK must be non-zero for this assertion:"); #if 0 /* * Give the #elif clause time to find problems, then remove it * and enable this. (Remove above, too.) */ KASSERT((flags & (M_NOWAIT|M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT|M_WAITOK)) == M_WAITOK, ("uma_zalloc_debug: must pass one of M_NOWAIT or M_WAITOK")); #elif defined(DDB) || defined(STACK) if (__predict_false((flags & (M_NOWAIT|M_WAITOK)) != M_NOWAIT && (flags & (M_NOWAIT|M_WAITOK)) != M_WAITOK)) { static int stack_count; struct stack st; if (stack_count < 10) { ++stack_count; printf("uma_zalloc* called with bad WAIT flags:\n"); stack_save(&st); stack_print(&st); } } #endif #endif #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & (UMA_ZONE_SMR | UMA_ZFLAG_CACHE)) == 0 && memguard_cmp_zone(zone)) { void *item; item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { error = EJUSTRETURN; if (zone->uz_init != NULL && zone->uz_init(item, zone->uz_size, flags) != 0) { *itemp = NULL; return (error); } if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); *itemp = NULL; return (error); } *itemp = item; return (error); } /* This is unfortunate but should not be fatal. */ } #endif return (error); } static int uma_zfree_debug(uma_zone_t zone, void *item, void *udata) { KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zfree_debug: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & (UMA_ZONE_SMR | UMA_ZFLAG_CACHE)) == 0 && is_memguard_addr(item)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return (EJUSTRETURN); } #endif return (0); } #endif static inline void * cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket, void *udata, int flags) { void *item; int size, uz_flags; item = cache_bucket_pop(cache, bucket); size = cache_uz_size(cache); uz_flags = cache_uz_flags(cache); critical_exit(); return (item_ctor(zone, uz_flags, size, udata, flags, item)); } static __noinline void * cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_cache_bucket_t bucket; int domain; while (cache_alloc(zone, cache, udata, flags)) { cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) continue; return (cache_alloc_item(zone, cache, bucket, udata, flags)); } critical_exit(); /* * We can not get a bucket so try to return a single item. */ if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) domain = PCPU_GET(domain); else domain = UMA_ANYDOMAIN; return (zone_alloc_item(zone, udata, domain, flags)); } /* See uma.h */ void * uma_zalloc_smr(uma_zone_t zone, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; CTR3(KTR_UMA, "uma_zalloc_smr zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zalloc_arg: called with non-SMR zone.")); if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN) return (item); #endif critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, NULL, flags)); return (cache_alloc_item(zone, cache, bucket, NULL, flags)); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_arg: called with SMR zone.")); if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, udata, flags)); return (cache_alloc_item(zone, cache, bucket, udata, flags)); } /* * Replenish an alloc bucket and possibly restore an old one. Called in * a critical section. Returns in a critical section. * * A false return value indicates an allocation failure. * A true return value indicates success and the caller should retry. */ static __noinline bool cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_bucket_t bucket; int curdomain, domain; bool new; CRITICAL_ASSERT(curthread); /* * If we have run out of items in our alloc bucket see * if we can switch with the free bucket. * * SMR Zones can't re-use the free bucket until the sequence has * expired. */ if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 && cache->uc_freebucket.ucb_cnt != 0) { cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); return (true); } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache_bucket_unload_alloc(cache); critical_exit(); if (bucket != NULL) { KASSERT(bucket->ub_cnt == 0, ("cache_alloc: Entered with non-empty alloc bucket.")); bucket_free(zone, bucket, udata); } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zdom lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ domain = PCPU_GET(domain); if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 || VM_DOMAIN_EMPTY(domain)) domain = zone_domain_highest(zone, domain); bucket = cache_fetch_bucket(zone, cache, domain); if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) { bucket = zone_alloc_bucket(zone, udata, domain, flags); new = true; } else { new = false; } CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket == NULL) { critical_enter(); return (false); } /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket.ucb_bucket == NULL && ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 || (curdomain = PCPU_GET(domain)) == domain || VM_DOMAIN_EMPTY(curdomain))) { if (new) atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax, bucket->ub_cnt); cache_bucket_load_alloc(cache, bucket); return (true); } /* * We lost the race, release this bucket and start over. */ critical_exit(); zone_put_bucket(zone, domain, bucket, udata, !new); critical_enter(); return (true); } void * uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { #ifdef NUMA uma_bucket_t bucket; uma_zone_domain_t zdom; void *item; #endif /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", zone->uz_name, zone, domain, flags); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_domain: called with SMR zone.")); #ifdef NUMA KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0, ("uma_zalloc_domain: called with non-FIRSTTOUCH zone.")); if (vm_ndomains == 1) return (uma_zalloc_arg(zone, udata, flags)); #ifdef UMA_ZALLOC_DEBUG if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * Try to allocate from the bucket cache before falling back to the keg. * We could try harder and attempt to allocate from per-CPU caches or * the per-domain cross-domain buckets, but the complexity is probably * not worth it. It is more important that frees of previous * cross-domain allocations do not blow up the cache. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt - 1] = NULL; #endif bucket->ub_cnt--; zone_put_bucket(zone, domain, bucket, udata, true); item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item != NULL) { KASSERT(item_domain(item) == domain, ("%s: bucket cache item %p from wrong domain", __func__, item)); counter_u64_add(zone->uz_allocs, 1); } return (item); } ZDOM_UNLOCK(zdom); return (zone_alloc_item(zone, udata, domain, flags)); #else return (uma_zalloc_arg(zone, udata, flags)); #endif } /* * Find a slab with some space. Prefer slabs that are partially used over those * that are totally full. This helps to reduce fragmentation. * * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check * only 'domain'. */ static uma_slab_t keg_first_slab(uma_keg_t keg, int domain, bool rr) { uma_domain_t dom; uma_slab_t slab; int start; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_first_slab: domain %d out of range", domain)); KEG_LOCK_ASSERT(keg, domain); slab = NULL; start = domain; do { dom = &keg->uk_domain[domain]; if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL) return (slab); if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) { LIST_REMOVE(slab, us_link); dom->ud_free_slabs--; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } if (rr) domain = (domain + 1) % vm_ndomains; } while (domain != start); return (NULL); } /* * Fetch an existing slab from a free or partial list. Returns with the * keg domain lock held if a slab was found or unlocked if not. */ static uma_slab_t keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) { uma_slab_t slab; uint32_t reserve; /* HASH has a single free list. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; KEG_LOCK(keg, domain); reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; if (keg->uk_domain[domain].ud_free_items <= reserve || (slab = keg_first_slab(keg, domain, rr)) == NULL) { KEG_UNLOCK(keg, domain); return (NULL); } return (slab); } static uma_slab_t keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) { struct vm_domainset_iter di; uma_slab_t slab; int aflags, domain; bool rr; KASSERT((flags & (M_WAITOK | M_NOVM)) != (M_WAITOK | M_NOVM), ("%s: invalid flags %#x", __func__, flags)); restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). * * To avoid races we run the iterator with the keg lock held, but that * means that we cannot allow the vm_domainset layer to sleep. Thus, * clear M_WAITOK and handle low memory conditions locally. */ rr = rdomain == UMA_ANYDOMAIN; if (rr) { aflags = (flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); } else { aflags = flags; domain = rdomain; } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); if (slab != NULL) return (slab); /* * M_NOVM is used to break the recursion that can otherwise * occur if low-level memory management routines use UMA. */ if ((flags & M_NOVM) == 0) { slab = keg_alloc_slab(keg, zone, domain, flags, aflags); if (slab != NULL) return (slab); } if (!rr) { if ((flags & M_USE_RESERVE) != 0) { /* * Drain reserves from other domains before * giving up or sleeping. It may be useful to * support per-domain reserves eventually. */ rdomain = UMA_ANYDOMAIN; goto restart; } if ((flags & M_WAITOK) == 0) break; vm_wait_domain(domain); } else if (vm_domainset_iter_policy(&di, &domain) != 0) { if ((flags & M_WAITOK) != 0) { vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); goto restart; } break; } } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) return (slab); return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { uma_domain_t dom; void *item; int freei; KEG_LOCK_ASSERT(keg, slab->us_domain); dom = &keg->uk_domain[slab->us_domain]; freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; BIT_CLR(keg->uk_ipers, freei, &slab->us_free); item = slab_item(slab, keg, freei); slab->us_freecount--; dom->ud_free_items--; /* * Move this slab to the full list. It must be on the partial list, so * we do not need to update the free slab count. In particular, * keg_fetch_slab() always returns slabs on the partial list. */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int zone_import(void *arg, void **bucket, int max, int domain, int flags) { uma_domain_t dom; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; #ifdef NUMA int stripe; #endif int i; zone = arg; slab = NULL; keg = zone->uz_keg; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) break; #ifdef NUMA stripe = howmany(max, vm_ndomains); #endif dom = &keg->uk_domain[slab->us_domain]; do { bucket[i++] = slab_alloc_item(keg, slab); if (keg->uk_reserve > 0 && dom->ud_free_items <= keg->uk_reserve) { /* * Avoid depleting the reserve after a * successful item allocation, even if * M_USE_RESERVE is specified. */ KEG_UNLOCK(keg, slab->us_domain); goto out; } #ifdef NUMA /* * If the zone is striped we pick a new slab for every * N allocations. Eliminating this conditional will * instead pick a new domain for each bucket rather * than stripe within each bucket. The current option * produces more fragmentation and requires more cpu * time but yields better distribution. */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && vm_ndomains > 1 && --stripe == 0) break; #endif } while (slab->us_freecount != 0 && i < max); KEG_UNLOCK(keg, slab->us_domain); /* Don't block if we allocated any successfully. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } out: return i; } static int zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) { uint64_t old, new, total, max; /* * The hard case. We're going to sleep because there were existing * sleepers or because we ran out of items. This routine enforces * fairness by keeping fifo order. * * First release our ill gotten gains and make some noise. */ for (;;) { zone_free_limit(zone, count); zone_log_warning(zone); zone_maxaction(zone); if (flags & M_NOWAIT) return (0); /* * We need to allocate an item or set ourself as a sleeper * while the sleepq lock is held to avoid wakeup races. This * is essentially a home rolled semaphore. */ sleepq_lock(&zone->uz_max_items); old = zone->uz_items; do { MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); /* Cache the max since we will evaluate twice. */ max = zone->uz_max_items; if (UZ_ITEMS_SLEEPERS(old) != 0 || UZ_ITEMS_COUNT(old) >= max) new = old + UZ_ITEMS_SLEEPER; else new = old + MIN(count, max - old); } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); /* We may have successfully allocated under the sleepq lock. */ if (UZ_ITEMS_SLEEPERS(new) == 0) { sleepq_release(&zone->uz_max_items); return (new - old); } /* * This is in a different cacheline from uz_items so that we * don't constantly invalidate the fastpath cacheline when we * adjust item counts. This could be limited to toggling on * transitions. */ atomic_add_32(&zone->uz_sleepers, 1); atomic_add_64(&zone->uz_sleeps, 1); /* * We have added ourselves as a sleeper. The sleepq lock * protects us from wakeup races. Sleep now and then retry. */ sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); sleepq_wait(&zone->uz_max_items, PVM); /* * After wakeup, remove ourselves as a sleeper and try * again. We no longer have the sleepq lock for protection. * * Subract ourselves as a sleeper while attempting to add * our count. */ atomic_subtract_32(&zone->uz_sleepers, 1); old = atomic_fetchadd_64(&zone->uz_items, -(UZ_ITEMS_SLEEPER - count)); /* We're no longer a sleeper. */ old -= UZ_ITEMS_SLEEPER; /* * If we're still at the limit, restart. Notably do not * block on other sleepers. Cache the max value to protect * against changes via sysctl. */ total = UZ_ITEMS_COUNT(old); max = zone->uz_max_items; if (total >= max) continue; /* Truncate if necessary, otherwise wake other sleepers. */ if (total + count > max) { zone_free_limit(zone, total + count - max); count = max - total; } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) wakeup_one(&zone->uz_max_items); return (count); } } /* * Allocate 'count' items from our max_items limit. Returns the number * available. If M_NOWAIT is not specified it will sleep until at least * one item can be allocated. */ static int zone_alloc_limit(uma_zone_t zone, int count, int flags) { uint64_t old; uint64_t max; max = zone->uz_max_items; MPASS(max > 0); /* * We expect normal allocations to succeed with a simple * fetchadd. */ old = atomic_fetchadd_64(&zone->uz_items, count); if (__predict_true(old + count <= max)) return (count); /* * If we had some items and no sleepers just return the * truncated value. We have to release the excess space * though because that may wake sleepers who weren't woken * because we were temporarily over the limit. */ if (old < max) { zone_free_limit(zone, (old + count) - max); return (max - old); } return (zone_alloc_limit_hard(zone, count, flags)); } /* * Free a number of items back to the limit. */ static void zone_free_limit(uma_zone_t zone, int count) { uint64_t old; MPASS(count > 0); /* * In the common case we either have no sleepers or * are still over the limit and can just return. */ old = atomic_fetchadd_64(&zone->uz_items, -count); if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) return; /* * Moderate the rate of wakeups. Sleepers will continue * to generate wakeups if necessary. */ wakeup_one(&zone->uz_max_items); } static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int error, maxbucket, cnt; CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name, zone, domain); /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) domain = UMA_ANYDOMAIN; if (zone->uz_max_items > 0) maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size, M_NOWAIT); else maxbucket = zone->uz_bucket_size; if (maxbucket == 0) return (NULL); /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) { cnt = 0; goto out; } bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, MIN(maxbucket, bucket->ub_entries), domain, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) { kasan_mark_item_valid(zone, bucket->ub_bucket[i]); error = zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags); kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); if (error != 0) break; } /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } cnt = bucket->ub_cnt; if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); counter_u64_add(zone->uz_fails, 1); bucket = NULL; } out: if (zone->uz_max_items > 0 && cnt < maxbucket) zone_free_limit(zone, maxbucket - cnt); return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { counter_u64_add(zone->uz_fails, 1); return (NULL); } /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail_cnt; /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { int error; kasan_mark_item_valid(zone, item); error = zone->uz_init(item, zone->uz_size, flags); kasan_mark_item_invalid(zone, item); if (error != 0) { zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); goto fail_cnt; } } item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item == NULL) goto fail; counter_u64_add(zone->uz_allocs, 1); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail_cnt: counter_u64_add(zone->uz_fails, 1); fail: if (zone->uz_max_items > 0) zone_free_limit(zone, 1); CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); return (NULL); } /* See uma.h */ void uma_zfree_smr(uma_zone_t zone, void *item) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain; #ifdef NUMA int uz_flags; #endif CTR3(KTR_UMA, "uma_zfree_smr zone %s(%p) item %p", zone->uz_name, zone, item); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zfree_smr: called with non-SMR zone.")); KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer.")); SMR_ASSERT_NOT_ENTERED(zone->uz_smr); if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN) return; #endif cache = &zone->uz_cpu[curcpu]; itemdomain = 0; #ifdef NUMA uz_flags = cache_uz_flags(cache); if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* SMR Zones must free to the free bucket. */ bucket = &cache->uc_freebucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } #endif if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, NULL, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zone_free_item(zone, item, NULL, SKIP_NONE); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); CTR3(KTR_UMA, "uma_zfree_arg zone %s(%p) item %p", zone->uz_name, zone, item); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zfree_arg: called with SMR zone.")); if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN) return; #endif /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; /* * We are accessing the per-cpu cache without a critical section to * fetch size and flags. This is acceptable, if we are preempted we * will simply read another cpu's line. */ cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); if (UMA_ALWAYS_CTORDTOR || __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0)) item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { if (atomic_load_32(&zone->uz_sleepers) > 0) goto zfree_item; } /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* * Try to free into the allocbucket first to give LIFO * ordering for cache-hot datastructures. Spill over * into the freebucket if necessary. Alloc will swap * them if one runs dry. */ bucket = &cache->uc_allocbucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } else #endif if (bucket->ucb_cnt == bucket->ucb_entries && cache->uc_freebucket.ucb_cnt < cache->uc_freebucket.ucb_entries) cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, udata, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); } #ifdef NUMA /* * sort crossdomain free buckets to domain correct buckets and cache * them. */ static void zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucketlist emptybuckets, fullbuckets; uma_zone_domain_t zdom; uma_bucket_t b; smr_seq_t seq; void *item; int domain; CTR3(KTR_UMA, "uma_zfree: zone %s(%p) draining cross bucket %p", zone->uz_name, zone, bucket); /* * It is possible for buckets to arrive here out of order so we fetch * the current smr seq rather than accepting the bucket's. */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); /* * To avoid having ndomain * ndomain buckets for sorting we have a * lock on the current crossfree bucket. A full matrix with * per-domain locking could be used if necessary. */ STAILQ_INIT(&emptybuckets); STAILQ_INIT(&fullbuckets); ZONE_CROSS_LOCK(zone); for (; bucket->ub_cnt > 0; bucket->ub_cnt--) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; domain = item_domain(item); zdom = ZDOM_GET(zone, domain); if (zdom->uzd_cross == NULL) { if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } else { /* * Avoid allocating a bucket with the cross lock * held, since allocation can trigger a * cross-domain free and bucket zones may * allocate from each other. */ ZONE_CROSS_UNLOCK(zone); b = bucket_alloc(zone, udata, M_NOWAIT); if (b == NULL) goto out; ZONE_CROSS_LOCK(zone); if (zdom->uzd_cross != NULL) { STAILQ_INSERT_HEAD(&emptybuckets, b, ub_link); } else { zdom->uzd_cross = b; } } } b = zdom->uzd_cross; b->ub_bucket[b->ub_cnt++] = item; b->ub_seq = seq; if (b->ub_cnt == b->ub_entries) { STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link); if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); zdom->uzd_cross = b; } } ZONE_CROSS_UNLOCK(zone); out: if (bucket->ub_cnt == 0) bucket->ub_seq = SMR_SEQ_INVALID; bucket_free(zone, bucket, udata); while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); bucket_free(zone, b, udata); } while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) { STAILQ_REMOVE_HEAD(&fullbuckets, ub_link); domain = item_domain(b->ub_bucket[0]); zone_put_bucket(zone, domain, b, udata, true); } } #endif static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws) { #ifdef NUMA /* * Buckets coming from the wrong domain will be entirely for the * only other domain on two domain systems. In this case we can * simply cache them. Otherwise we need to sort them back to * correct domains. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) { zone_free_cross(zone, bucket, udata); return; } #endif /* * Attempt to save the bucket in the zone's domain bucket cache. */ CTR3(KTR_UMA, "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) itemdomain = zone_domain_lowest(zone, itemdomain); zone_put_bucket(zone, itemdomain, bucket, udata, ws); } /* * Populate a free or cross bucket for the current cpu cache. Free any * existing full bucket either to the zone cache or back to the slab layer. * * Enters and returns in a critical section. false return indicates that * we can not satisfy this free in the cache layer. true indicates that * the caller should retry. */ static __noinline bool cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, int itemdomain) { uma_cache_bucket_t cbucket; uma_bucket_t newbucket, bucket; CRITICAL_ASSERT(curthread); if (zone->uz_bucket_size == 0) return false; cache = &zone->uz_cpu[curcpu]; newbucket = NULL; /* * FIRSTTOUCH domains need to free to the correct zdom. When * enabled this is the zdom of the item. The bucket is the * cross bucket if the current domain and itemdomain do not match. */ cbucket = &cache->uc_freebucket; #ifdef NUMA if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain) { cbucket = &cache->uc_crossbucket; if (cbucket->ucb_cnt != 0) counter_u64_add(zone->uz_xdomain, cbucket->ucb_cnt); } } #endif bucket = cache_bucket_unload(cbucket); KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries, ("cache_free: Entered with non-full free bucket.")); /* We are no longer associated with this CPU. */ critical_exit(); /* * Don't let SMR zones operate without a free bucket. Force * a synchronize and re-use this one. We will only degrade * to a synchronize every bucket_size items rather than every * item if we fail to allocate a bucket. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) { if (bucket != NULL) bucket->ub_seq = smr_advance(zone->uz_smr); newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (newbucket == NULL && bucket != NULL) { bucket_drain(zone, bucket); newbucket = bucket; bucket = NULL; } } else if (!bucketdisable) newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (bucket != NULL) zone_free_bucket(zone, bucket, udata, itemdomain, true); critical_enter(); if ((bucket = newbucket) == NULL) return (false); cache = &zone->uz_cpu[curcpu]; #ifdef NUMA /* * Check to see if we should be populating the cross bucket. If it * is already populated we will fall through and attempt to populate * the free bucket. */ if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain && cache->uc_crossbucket.ucb_bucket == NULL) { cache_bucket_load_cross(cache, bucket); return (true); } } #endif /* * We may have lost the race to fill the bucket or switched CPUs. */ if (cache->uc_freebucket.ucb_bucket != NULL) { critical_exit(); bucket_free(zone, bucket, udata); critical_enter(); } else cache_bucket_load_free(cache, bucket); return (true); } static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; uma_domain_t dom; int freei; keg = zone->uz_keg; KEG_LOCK_ASSERT(keg, slab->us_domain); /* Do we need to remove from any lists? */ dom = &keg->uk_domain[slab->us_domain]; if (slab->us_freecount + 1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ freei = slab_item_index(slab, keg, item); BIT_SET(keg->uk_ipers, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ dom->ud_free_items++; } static void zone_release(void *arg, void **bucket, int cnt) { struct mtx *lock; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; void *item; int i; zone = arg; keg = zone->uz_keg; lock = NULL; if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) lock = KEG_LOCK(keg, 0); for (i = 0; i < cnt; i++) { item = bucket[i]; if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { slab = vtoslab((vm_offset_t)item); } else { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) slab = hash_sfind(&keg->uk_hash, mem); else slab = (uma_slab_t)(mem + keg->uk_pgoff); } if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { if (lock != NULL) mtx_unlock(lock); lock = KEG_LOCK(keg, slab->us_domain); } slab_free_item(zone, slab, item); } if (lock != NULL) mtx_unlock(lock); } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static __noinline void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { /* * If a free is sent directly to an SMR zone we have to * synchronize immediately because the item can instantly * be reallocated. This should only happen in degenerate * cases when no memory is available for per-cpu caches. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE) smr_synchronize(zone->uz_smr); item_dtor(zone, item, zone->uz_size, udata, skip); if (skip < SKIP_FINI && zone->uz_fini) { kasan_mark_item_valid(zone, item); zone->uz_fini(item, zone->uz_size); kasan_mark_item_invalid(zone, item); } zone->uz_release(zone->uz_arg, &item, 1); if (skip & SKIP_CNT) return; counter_u64_add(zone->uz_frees, 1); if (zone->uz_max_items > 0) zone_free_limit(zone, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { /* * If the limit is small, we may need to constrain the maximum per-CPU * cache size, or disable caching entirely. */ uma_zone_set_maxcache(zone, nitems); /* * XXX This can misbehave if the zone has any allocations with * no limit and a limit is imposed. There is currently no * way to clear a limit. */ ZONE_LOCK(zone); if (zone->uz_max_items == 0) ZONE_ASSERT_COLD(zone); zone->uz_max_items = nitems; zone->uz_flags |= UMA_ZFLAG_LIMIT; zone_update_caches(zone); /* We may need to wake waiters. */ wakeup(&zone->uz_max_items); ZONE_UNLOCK(zone); return (nitems); } /* See uma.h */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems) { int bpcpu, bpdom, bsize, nb; ZONE_LOCK(zone); /* * Compute a lower bound on the number of items that may be cached in * the zone. Each CPU gets at least two buckets, and for cross-domain * frees we use an additional bucket per CPU and per domain. Select the * largest bucket size that does not exceed half of the requested limit, * with the left over space given to the full bucket cache. */ bpdom = 0; bpcpu = 2; #ifdef NUMA if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) { bpcpu++; bpdom++; } #endif nb = bpcpu * mp_ncpus + bpdom * vm_ndomains; bsize = nitems / nb / 2; if (bsize > BUCKET_MAX) bsize = BUCKET_MAX; else if (bsize == 0 && nitems / nb > 0) bsize = 1; zone->uz_bucket_size_max = zone->uz_bucket_size = bsize; if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) zone->uz_bucket_size_min = zone->uz_bucket_size_max; zone->uz_bucket_max = nitems - nb * bsize; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; nitems = atomic_load_64(&zone->uz_max_items); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_ASSERT_COLD(zone); zone->uz_warning = warning; } /* See uma.h */ void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) { ZONE_ASSERT_COLD(zone); TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs) - counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems < 0 ? 0 : nitems); } static uint64_t uma_zone_get_allocs(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); return (nitems); } static uint64_t uma_zone_get_frees(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems); } #ifdef INVARIANTS /* Used only for KEG_ASSERT_COLD(). */ static uint64_t uma_keg_get_allocs(uma_keg_t keg) { uma_zone_t z; uint64_t nitems; nitems = 0; LIST_FOREACH(z, &keg->uk_zones, uz_link) nitems += uma_zone_get_allocs(z); return (nitems); } #endif /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_init = uminit; } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_fini = fini; } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_ASSERT_COLD(zone); zone->uz_init = zinit; } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_ASSERT_COLD(zone); zone->uz_fini = zfini; } /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_freef = freef; } /* See uma.h */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_allocf = allocf; } /* See uma.h */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr) { ZONE_ASSERT_COLD(zone); KASSERT(smr != NULL, ("Got NULL smr")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("zone %p (%s) already uses SMR", zone, zone->uz_name)); zone->uz_flags |= UMA_ZONE_SMR; zone->uz_smr = smr; zone_update_caches(zone); } smr_t uma_zone_get_smr(uma_zone_t zone) { return (zone->uz_smr); } /* See uma.h */ void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_reserve = items; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; u_int pages; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); ZONE_ASSERT_COLD(zone); pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; #ifdef UMA_USE_DMAP if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); if (kva == 0) return (0); } else kva = 0; MPASS(keg->uk_kva == 0); keg->uk_kva = kva; keg->uk_offset = 0; zone->uz_max_items = pages * keg->uk_ipers; #ifdef UMA_USE_DMAP keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone_update_caches(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { struct vm_domainset_iter di; uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; int aflags, domain, slabs; KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { aflags = M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); for (;;) { slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, aflags); if (slab != NULL) { dom = &keg->uk_domain[slab->us_domain]; /* * keg_alloc_slab() always returns a slab on the * partial list. */ LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; KEG_UNLOCK(keg, slab->us_domain); break; } if (vm_domainset_iter_policy(&di, &domain) != 0) vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); } } } /* * Returns a snapshot of memory consumption in bytes. */ size_t uma_zone_memory(uma_zone_t zone) { size_t sz; int i; sz = 0; if (zone->uz_flags & UMA_ZFLAG_CACHE) { for (i = 0; i < vm_ndomains; i++) sz += ZDOM_GET(zone, i)->uzd_nitems; return (sz * zone->uz_size); } for (i = 0; i < vm_ndomains; i++) sz += zone->uz_keg->uk_domain[i].ud_pages; return (sz * PAGE_SIZE); } struct uma_reclaim_args { int domain; int req; }; static void uma_reclaim_domain_cb(uma_zone_t zone, void *arg) { struct uma_reclaim_args *args; args = arg; - if ((zone->uz_flags & UMA_ZONE_UNMANAGED) == 0) - uma_zone_reclaim_domain(zone, args->req, args->domain); + if ((zone->uz_flags & UMA_ZONE_UNMANAGED) != 0) + return; + if ((args->req == UMA_RECLAIM_TRIM) && + (zone->uz_flags & UMA_ZONE_NOTRIM) !=0) + return; + + uma_zone_reclaim_domain(zone, args->req, args->domain); } /* See uma.h */ void uma_reclaim(int req) { uma_reclaim_domain(req, UMA_ANYDOMAIN); } void uma_reclaim_domain(int req, int domain) { struct uma_reclaim_args args; bucket_enable(); args.domain = domain; args.req = req; sx_slock(&uma_reclaim_lock); switch (req) { case UMA_RECLAIM_TRIM: case UMA_RECLAIM_DRAIN: zone_foreach(uma_reclaim_domain_cb, &args); break; case UMA_RECLAIM_DRAIN_CPU: zone_foreach(uma_reclaim_domain_cb, &args); pcpu_cache_drain_safe(NULL); zone_foreach(uma_reclaim_domain_cb, &args); break; default: panic("unhandled reclamation request %d", req); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ uma_zone_reclaim_domain(slabzones[0], UMA_RECLAIM_DRAIN, domain); uma_zone_reclaim_domain(slabzones[1], UMA_RECLAIM_DRAIN, domain); bucket_zone_drain(domain); sx_sunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; void uma_reclaim_wakeup(void) { if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) wakeup(uma_reclaim); } void uma_reclaim_worker(void *arg __unused) { for (;;) { sx_xlock(&uma_reclaim_lock); while (atomic_load_int(&uma_reclaim_needed) == 0) sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", hz); sx_xunlock(&uma_reclaim_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); uma_reclaim(UMA_RECLAIM_DRAIN_CPU); atomic_store_int(&uma_reclaim_needed, 0); /* Don't fire more than once per-second. */ pause("umarclslp", hz); } } /* See uma.h */ void uma_zone_reclaim(uma_zone_t zone, int req) { uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN); } void uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain) { switch (req) { case UMA_RECLAIM_TRIM: zone_reclaim(zone, domain, M_NOWAIT, false); break; case UMA_RECLAIM_DRAIN: zone_reclaim(zone, domain, M_NOWAIT, true); break; case UMA_RECLAIM_DRAIN_CPU: pcpu_cache_drain_safe(zone); zone_reclaim(zone, domain, M_NOWAIT, true); break; default: panic("unhandled reclamation request %d", req); } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { return (atomic_load_32(&zone->uz_sleepers) > 0); } unsigned long uma_limit(void) { return (uma_kmem_limit); } void uma_set_limit(unsigned long limit) { uma_kmem_limit = limit; } unsigned long uma_size(void) { return (atomic_load_long(&uma_kmem_total)); } long uma_avail(void) { return (uma_kmem_limit - uma_size()); } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * */ static void uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) { uma_cache_t cache; uint64_t allocs, frees, sleeps, xdomain; int cachefree, cpu; allocs = frees = sleeps = xdomain = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; cachefree += cache->uc_allocbucket.ucb_cnt; cachefree += cache->uc_freebucket.ucb_cnt; xdomain += cache->uc_crossbucket.ucb_cnt; cachefree += cache->uc_crossbucket.ucb_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += counter_u64_fetch(z->uz_allocs); frees += counter_u64_fetch(z->uz_frees); xdomain += counter_u64_fetch(z->uz_xdomain); sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; if (xdomainp != NULL) *xdomainp = xdomain; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static void uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, struct uma_percpu_stat *ups, bool internal) { uma_zone_domain_t zdom; uma_cache_t cache; int i; for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(z, i); uth->uth_zone_free += zdom->uzd_nitems; } uth->uth_allocs = counter_u64_fetch(z->uz_allocs); uth->uth_frees = counter_u64_fetch(z->uz_frees); uth->uth_fails = counter_u64_fetch(z->uz_fails); uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain); uth->uth_sleeps = z->uz_sleeps; for (i = 0; i < mp_maxid + 1; i++) { bzero(&ups[i], sizeof(*ups)); if (internal || CPU_ABSENT(i)) continue; cache = &z->uz_cpu[i]; ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; ups[i].ups_allocs = cache->uc_allocs; ups[i].ups_frees = cache->uc_frees; } } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat *ups; struct sbuf sbuf; uma_keg_t kz; uma_zone_t z; uint64_t items; uint32_t kfree, pages; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { kfree = pages = 0; for (i = 0; i < vm_ndomains; i++) { kfree += kz->uk_domain[i].ud_free_items; pages += kz->uk_domain[i].ud_pages; } LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; if (z->uz_max_items > 0) { items = UZ_ITEMS_COUNT(z->uz_items); uth.uth_pages = (items / kz->uk_ipers) * kz->uk_ppera; } else uth.uth_pages = pages; uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * kz->uk_ppera; uth.uth_limit = z->uz_max_items; uth.uth_keg_free = kfree; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; uma_vm_zone_stats(&uth, z, &sbuf, ups, kz->uk_flags & UMA_ZFLAG_INTERNAL); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } LIST_FOREACH(z, &uma_cachezones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_size = z->uz_size; uma_vm_zone_stats(&uth, z, &sbuf, ups, false); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); free(ups, M_TEMP); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max; max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone; int cur; /* * Some callers want to add sysctls for global zones that * may not yet exist so they pass a pointer to a pointer. */ if (arg2 == 0) zone = *(uma_zone_t *)arg1; else zone = arg1; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_allocs(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_frees(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; uma_zone_t zone = arg1; int error; sbuf_new_for_sysctl(&sbuf, NULL, 0, req); if (zone->uz_flags != 0) sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); else sbuf_printf(&sbuf, "0"); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) { uma_keg_t keg = arg1; int avail, effpct, total; total = keg->uk_ppera * PAGE_SIZE; if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; /* * We consider the client's requested size and alignment here, not the * real size determination uk_rsize, because we also adjust the real * size for internal implementation reasons (max bitset size). */ avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) avail *= mp_maxid + 1; effpct = 100 * avail / total; return (sysctl_handle_int(oidp, &effpct, 0, req)); } static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); return (sysctl_handle_64(oidp, &cur, 0, req)); } #ifdef INVARIANTS static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; uma_keg_t keg; uint8_t *mem; /* * It is safe to return the slab here even though the * zone is unlocked because the item's allocation state * essentially holds a reference. */ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (NULL); if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) return (vtoslab((vm_offset_t)mem)); keg = zone->uz_keg; if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) return ((uma_slab_t)(mem + keg->uk_pgoff)); KEG_LOCK(keg, 0); slab = hash_sfind(&keg->uk_hash, mem); KEG_UNLOCK(keg, 0); return (slab); } static bool uma_dbg_zskip(uma_zone_t zone, void *mem) { if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (true); return (uma_dbg_kskip(zone->uz_keg, mem)); } static bool uma_dbg_kskip(uma_keg_t keg, void *mem) { uintptr_t idx; if (dbg_divisor == 0) return (true); if (dbg_divisor == 1) return (false); idx = (uintptr_t)mem >> PAGE_SHIFT; if (keg->uk_ipers > 1) { idx *= keg->uk_ipers; idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; } if ((idx / dbg_divisor) * dbg_divisor != idx) { counter_u64_add(uma_skip_cnt, 1); return (true); } counter_u64_add(uma_dbg_cnt, 1); return (false); } /* * Set up the slab's freei data such that uma_dbg_free can function. * */ static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } /* * Verifies freed addresses. Checks for alignment, valid slab membership * and duplicate frees. * */ static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: Freed item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (freei >= keg->uk_ipers) panic("Invalid free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (slab_item(slab, keg, freei) != item) panic("Unaligned free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); } #endif /* INVARIANTS */ #ifdef DDB static int64_t get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, uint64_t *sleeps, long *cachefree, uint64_t *xdomain) { uint64_t frees; int i; if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { *allocs = counter_u64_fetch(z->uz_allocs); frees = counter_u64_fetch(z->uz_frees); *sleeps = z->uz_sleeps; *cachefree = 0; *xdomain = 0; } else uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, xdomain); for (i = 0; i < vm_ndomains; i++) { *cachefree += ZDOM_GET(z, i)->uzd_nitems; if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) *cachefree += kz->uk_domain[i].ud_free_items; } *used = *allocs - frees; return (((int64_t)*used + *cachefree) * kz->uk_size); } DB_SHOW_COMMAND_FLAGS(uma, db_show_uma, DB_CMD_MEMSAFE) { const char *fmt_hdr, *fmt_entry; uma_keg_t kz; uma_zone_t z; uint64_t allocs, used, sleeps, xdomain; long cachefree; /* variables for sorting */ uma_keg_t cur_keg; uma_zone_t cur_zone, last_zone; int64_t cur_size, last_size, size; int ties; /* /i option produces machine-parseable CSV output */ if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; } else { fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; } db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket", "Total Mem", "XFree"); /* Sort the zones with largest size first. */ last_zone = NULL; last_size = INT64_MAX; for (;;) { cur_zone = NULL; cur_size = -1; ties = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { /* * In the case of size ties, print out zones * in the order they are encountered. That is, * when we encounter the most recently output * zone, we have already printed all preceding * ties, and we must print all following ties. */ if (z == last_zone) { ties = 1; continue; } size = get_uma_stats(kz, z, &allocs, &used, &sleeps, &cachefree, &xdomain); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_zone = z; cur_keg = kz; } } } if (cur_zone == NULL) break; size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, &sleeps, &cachefree, &xdomain); db_printf(fmt_entry, cur_zone->uz_name, (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, (uintmax_t)allocs, (uintmax_t)sleeps, (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, xdomain); if (db_pager_quit) return; last_zone = cur_zone; last_size = cur_size; } } DB_SHOW_COMMAND_FLAGS(umacache, db_show_umacache, DB_CMD_MEMSAFE) { uma_zone_t z; uint64_t allocs, frees; long cachefree; int i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); for (i = 0; i < vm_ndomains; i++) cachefree += ZDOM_GET(z, i)->uzd_nitems; db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_bucket_size); if (db_pager_quit) return; } } #endif /* DDB */