diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index b3235e8a1e0c..2495e940a6a0 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -1,2884 +1,2890 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #include #include #ifdef RSS #include #include #endif #include #include #if defined(INET) || defined(INET6) #include #include #endif #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf) m_head; STAILQ_HEAD(, socket) so_head; bool running; int lastallocfail; } __aligned(CACHE_LINE_SIZE); struct ktls_alloc_thread { uint64_t wakeups; uint64_t allocs; struct thread *td; int running; }; struct ktls_domain_info { int count; int cpu[MAXCPU]; struct ktls_alloc_thread alloc_td; }; struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; static uma_zone_t ktls_session_zone; static uma_zone_t ktls_buffer_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; static int ktls_init_state; static struct sx ktls_init_lock; SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init"); SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Kernel TLS offload stats"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores (1) or cores and domains (2) at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); unsigned int ktls_ifnet_max_rexmit_pct = 2; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN, &ktls_ifnet_max_rexmit_pct, 2, "Max percent bytes retransmitted before ifnet TLS is disabled"); static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static bool ktls_sw_buffer_cache = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN, &ktls_sw_buffer_cache, 1, "Enable caching of output buffers for SW encryption"); static int ktls_max_alloc = 128; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN, &ktls_max_alloc, 128, "Max number of 16k buffers to allocate in thread context"); static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active); SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD, &ktls_cnt_tx_pending, "Number of TLS 1.0 records waiting for earlier TLS records"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, &ktls_cnt_tx_queued, "Number of TLS records in queue to tasks for SW encryption"); static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, &ktls_cnt_rx_queued, "Number of TLS sockets in queue to tasks for SW decryption"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_total); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_active); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, &ktls_offload_corrupted_records, "Total corrupted TLS records received"); static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD, &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD, &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Hardware (ifnet) TLS session stats"); #ifdef TCP_OFFLOAD SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TOE TLS session stats"); #endif static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_sw_chacha20, "Active number of software TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_ifnet_chacha20, "Active number of ifnet TLS sessions using Chacha20-Poly1305"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed); SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); #ifdef TCP_OFFLOAD static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD, &ktls_toe_cbc, "Active number of TOE TLS sessions using AES-CBC"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD, &ktls_toe_gcm, "Active number of TOE TLS sessions using AES-GCM"); static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20); SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD, &ktls_toe_chacha20, "Active number of TOE TLS sessions using Chacha20-Poly1305"); #endif static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); static void ktls_alloc_thread(void *ctx); #if defined(INET) || defined(INET6) static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; #ifdef NUMA struct ktls_domain_info *di; #endif u_int cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that TLS 1.0 sessions rely on the * serialization provided by having the same connection use * the same queue. */ #ifdef NUMA if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { di = &ktls_domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } else #endif cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static int ktls_buffer_import(void *arg, void **store, int count, int domain, int flags) { vm_page_t m; int i, req; KASSERT((ktls_maxlen & PAGE_MASK) == 0, ("%s: ktls max length %d is not page size-aligned", __func__, ktls_maxlen)); req = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags); for (i = 0; i < count; i++) { m = vm_page_alloc_noobj_contig_domain(domain, req, atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if (m == NULL) break; store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (i); } static void ktls_buffer_release(void *arg __unused, void **store, int count) { vm_page_t m; int i, j; for (i = 0; i < count; i++) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (j = 0; j < atop(ktls_maxlen); j++) { (void)vm_page_unwire_noq(m + j); vm_page_free(m + j); } } } static void ktls_free_mext_contig(struct mbuf *m) { M_ASSERTEXTPG(m); uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0])); } static int ktls_init(void) { struct thread *td; struct pcpu *pc; int count, domain, error, i; ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); if (ktls_sw_buffer_cache) { ktls_buffer_zone = uma_zcache_create("ktls_buffers", roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL, ktls_buffer_import, ktls_buffer_release, NULL, UMA_ZONE_FIRSTTOUCH); } /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].m_head); STAILQ_INIT(&ktls_wq[i].so_head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); if (ktls_bind_threads > 1) { pc = pcpu_find(i); domain = pc->pc_domain; count = ktls_domains[domain].count; ktls_domains[domain].cpu[count] = i; ktls_domains[domain].count++; } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } /* * If we somehow have an empty domain, fall back to choosing * among all KTLS threads. */ if (ktls_bind_threads > 1) { for (i = 0; i < vm_ndomains; i++) { if (ktls_domains[i].count == 0) { ktls_bind_threads = 1; break; } } } /* Start kthreads for each workqueue. */ CPU_FOREACH(i) { error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) { printf("Can't add KTLS thread %d error %d\n", i, error); return (error); } } /* * Start an allocation thread per-domain to perform blocking allocations * of 16k physically contiguous TLS crypto destination buffers. */ if (ktls_sw_buffer_cache) { for (domain = 0; domain < vm_ndomains; domain++) { if (VM_DOMAIN_EMPTY(domain)) continue; if (CPU_EMPTY(&cpuset_domain[domain])) continue; error = kproc_kthread_add(ktls_alloc_thread, &ktls_domains[domain], &ktls_proc, &ktls_domains[domain].alloc_td.td, 0, 0, "KTLS", "alloc_%d", domain); if (error) { printf("Can't add KTLS alloc thread %d error %d\n", domain, error); return (error); } } } if (bootverbose) printf("KTLS: Initialized %d threads\n", ktls_number_threads); return (0); } static int ktls_start_kthreads(void) { int error, state; start: state = atomic_load_acq_int(&ktls_init_state); if (__predict_true(state > 0)) return (0); if (state < 0) return (ENXIO); sx_xlock(&ktls_init_lock); if (ktls_init_state != 0) { sx_xunlock(&ktls_init_lock); goto start; } error = ktls_init(); if (error == 0) state = 1; else state = -1; atomic_store_rel_int(&ktls_init_state, state); sx_xunlock(&ktls_init_lock); return (error); } #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.3 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_THREE) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv)) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: #ifdef COMPAT_FREEBSD12 /* XXX: Really 13.0-current COMPAT. */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: #endif break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); switch (en->tls_vminor) { case TLS_MINOR_VER_TWO: if (en->iv_len != TLS_AEAD_GCM_LEN) return (EINVAL); break; case TLS_MINOR_VER_THREE: if (en->iv_len != TLS_1_3_GCM_IV_LEN) return (EINVAL); break; default: return (EINVAL); } break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: break; case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: if (en->tls_vminor != TLS_MINOR_VER_TWO) return (EINVAL); break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); /* * TLS 1.0 requires an implicit IV. TLS 1.1 and 1.2 * use explicit IVs. */ switch (en->tls_vminor) { case TLS_MINOR_VER_ZERO: if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; case TLS_MINOR_VER_ONE: case TLS_MINOR_VER_TWO: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } break; case CRYPTO_CHACHA20_POLY1305: if (en->auth_algorithm != 0 || en->auth_key_len != 0) return (EINVAL); if (en->tls_vminor != TLS_MINOR_VER_TWO && en->tls_vminor != TLS_MINOR_VER_THREE) return (EINVAL); if (en->iv_len != TLS_CHACHA20_IV_LEN) return (EINVAL); break; default: return (EINVAL); } error = ktls_start_kthreads(); if (error != 0) return (error); tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte * nonce. TLS 1.3 uses a 12 byte implicit IV. */ if (en->tls_vminor < TLS_MINOR_VER_THREE) tls->params.tls_hlen += sizeof(uint64_t); tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. */ tls->sequential_records = true; tls->next_seqno = be64dec(en->rec_seq); STAILQ_INIT(&tls->pending_records); } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; case CRYPTO_CHACHA20_POLY1305: /* * Chacha20 uses a 12 byte implicit IV. */ tls->params.tls_tlen = POLY1305_HASH_LEN; tls->params.tls_bs = 1; break; default: panic("invalid cipher"); } /* * TLS 1.3 includes optional padding which we do not support, * and also puts the "real" record type at the end of the * encrypted data. */ if (en->tls_vminor == TLS_MINOR_VER_THREE) tls->params.tls_tlen += sizeof(uint8_t); KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for AEAD * ciphers and the initial implicit IV for TLS 1.0. The * explicit portions of the IV are generated in ktls_frame(). */ if (en->iv_len != 0) { tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; /* * For TLS 1.2 with GCM, generate an 8-byte nonce as a * counter to generate unique explicit IVs. * * Store this counter in the last 8 bytes of the IV * array so that it is 8-byte aligned. */ if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && en->tls_vminor == TLS_MINOR_VER_TWO) arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0); } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag, tls_new); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. */ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); switch (tls->mode) { case TCP_TLS_MODE_SW: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, -1); break; } break; case TCP_TLS_MODE_IFNET: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, -1); break; } if (tls->snd_tag != NULL) m_snd_tag_rele(tls->snd_tag); break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, -1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, -1); break; } break; #endif } if (tls->ocf_session != NULL) ktls_ocf_free(tls); if (tls->params.auth_key != NULL) { zfree(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { zfree(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) #ifdef TCP_OFFLOAD static int ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) { struct inpcb *inp; struct tcpcb *tp; int error; inp = so->so_pcb; INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); if (!(tp->t_flags & TF_TOE)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = tcp_offload_alloc_tls_session(tp, tls, direction); INP_WUNLOCK(inp); if (error == 0) { tls->mode = TCP_TLS_MODE_TOE; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_toe_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_toe_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_toe_chacha20, 1); break; } } return (error); } #endif /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct nhop_object *nh; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. */ nh = inp->inp_route.ro_nh; if (nh == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = nh->nh_ifp; if_ref(ifp); /* * Allocate a TLS + ratelimit tag if the connection has an * existing pacing rate. */ if (tp->t_pacing_rate != -1 && (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) { params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT; params.tls_rate_limit.inp = inp; params.tls_rate_limit.tls = tls; params.tls_rate_limit.max_rate = tp->t_pacing_rate; } else { params.hdr.type = IF_SND_TAG_TYPE_TLS; params.tls.inp = inp; params.tls.tls = tls; } params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.hdr.numa_domain = inp->inp_numa_domain; INP_RUNLOCK(inp); if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = m_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->mode = TCP_TLS_MODE_IFNET; tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_ifnet_chacha20, 1); break; } } return (error); } static void ktls_use_sw(struct ktls_session *tls) { tls->mode = TCP_TLS_MODE_SW; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; case CRYPTO_CHACHA20_POLY1305: counter_u64_add(ktls_sw_chacha20, 1); break; } } static int ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction) { int error; error = ktls_ocf_try(so, tls, direction); if (error) return (error); ktls_use_sw(tls); return (0); } /* * KTLS RX stores data in the socket buffer as a list of TLS records, * where each record is stored as a control message containg the TLS * header followed by data mbufs containing the decrypted data. This * is different from KTLS TX which always uses an mb_ext_pgs mbuf for * both encrypted and decrypted data. TLS records decrypted by a NIC * should be queued to the socket buffer as records, but encrypted * data which needs to be decrypted by software arrives as a stream of * regular mbufs which need to be converted. In addition, there may * already be pending encrypted data in the socket buffer when KTLS RX * is enabled. * * To manage not-yet-decrypted data for KTLS RX, the following scheme * is used: * * - A single chain of NOTREADY mbufs is hung off of sb_mtls. * * - ktls_check_rx checks this chain of mbufs reading the TLS header * from the first mbuf. Once all of the data for that TLS record is * queued, the socket is queued to a worker thread. * * - The worker thread calls ktls_decrypt to decrypt TLS records in * the TLS chain. Each TLS record is detached from the TLS chain, * decrypted, and inserted into the regular socket buffer chain as * record starting with a control message holding the TLS header and * a chain of mbufs holding the encrypted data. */ static void sb_mark_notready(struct sockbuf *sb) { struct mbuf *m; m = sb->sb_mb; sb->sb_mtls = m; sb->sb_mb = NULL; sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; for (; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", __func__)); KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", __func__)); KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", __func__)); m->m_flags |= M_NOTREADY; sb->sb_acc -= m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mtlstail = m; } KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, sb->sb_ccc)); } /* * Return information about the pending TLS data in a socket * buffer. On return, 'seqno' is set to the sequence number * of the next TLS record to be received, 'resid' is set to * the amount of bytes still needed for the last pending * record. The function returns 'false' if the last pending * record contains a partial TLS header. In that case, 'resid' * is the number of bytes needed to complete the TLS header. */ bool ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp) { struct tls_record_layer hdr; struct mbuf *m; uint64_t seqno; size_t resid; u_int offset, record_len; SOCKBUF_LOCK_ASSERT(sb); MPASS(sb->sb_flags & SB_TLS_RX); seqno = sb->sb_tls_seqno; resid = sb->sb_tlscc; m = sb->sb_mtls; offset = 0; if (resid == 0) { *seqnop = seqno; *residp = 0; return (true); } for (;;) { seqno++; if (resid < sizeof(hdr)) { *seqnop = seqno; *residp = sizeof(hdr) - resid; return (false); } m_copydata(m, offset, sizeof(hdr), (void *)&hdr); record_len = sizeof(hdr) + ntohs(hdr.tls_length); if (resid <= record_len) { *seqnop = seqno; *residp = record_len - resid; return (true); } resid -= record_len; while (record_len != 0) { if (m->m_len - offset > record_len) { offset += record_len; break; } record_len -= (m->m_len - offset); offset = 0; m = m->m_next; } } } int ktls_enable_rx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_rcv.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); error = ktls_create_session(so, en, &tls); if (error) return (error); error = ktls_ocf_try(so, tls, KTLS_RX); if (error) { ktls_cleanup(tls); return (error); } /* Mark the socket as using TLS offload. */ SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); so->so_rcv.sb_tls_info = tls; so->so_rcv.sb_flags |= SB_TLS_RX; /* Mark existing data as not ready until it can be decrypted. */ sb_mark_notready(&so->so_rcv); ktls_check_rx(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_RX); if (error) #endif ktls_use_sw(tls); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; int error; if (!ktls_offload_enable) return (ENOTSUP); if (SOLISTENING(so)) return (EINVAL); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer TOE -> ifnet TLS -> software TLS. */ #ifdef TCP_OFFLOAD error = ktls_try_toe(so, tls, KTLS_TX); if (error) #endif error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls, KTLS_TX); if (error) { ktls_cleanup(tls); return (error); } error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } /* * Write lock the INP when setting sb_tls_info so that * routines in tcp_ratelimit.c can read sb_tls_info while * holding the INP lock. */ inp = so->so_pcb; INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_rx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp __diagused; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_RECVBUF_LOCK(so); tls = so->so_rcv.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_RECVBUF_UNLOCK(so); return (0); } /* * ktls_get_rx_sequence - get the next TCP- and TLS- sequence number. * * This function gets information about the next TCP- and TLS- * sequence number to be processed by the TLS receive worker * thread. The information is extracted from the given "inpcb" * structure. The values are stored in host endian format at the two * given output pointer locations. The TCP sequence number points to * the beginning of the TLS header. * * This function returns zero on success, else a non-zero error code * is returned. */ int ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq) { struct socket *so; struct tcpcb *tp; INP_RLOCK(inp); so = inp->inp_socket; if (__predict_false(so == NULL)) { INP_RUNLOCK(inp); return (EINVAL); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); MPASS(tp != NULL); SOCKBUF_LOCK(&so->so_rcv); *tcpseq = tp->rcv_nxt - so->so_rcv.sb_tlscc; *tlsseq = so->so_rcv.sb_tls_seqno; SOCKBUF_UNLOCK(&so->so_rcv); INP_RUNLOCK(inp); return (0); } int ktls_get_tx_mode(struct socket *so, int *modep) { struct ktls_session *tls; struct inpcb *inp __diagused; if (SOLISTENING(so)) return (EINVAL); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCK_SENDBUF_LOCK(so); tls = so->so_snd.sb_tls_info; if (tls == NULL) *modep = TCP_TLS_MODE_NONE; else *modep = tls->mode; SOCK_SENDBUF_UNLOCK(so); return (0); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; if (SOLISTENING(so)) return (EINVAL); switch (mode) { case TCP_TLS_MODE_SW: case TCP_TLS_MODE_IFNET: break; default: return (EINVAL); } inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if (tls->mode == mode) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new, KTLS_TX); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = SOCK_IO_SEND_LOCK(so, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); SOCK_IO_SEND_UNLOCK(so); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); SOCK_IO_SEND_UNLOCK(so); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. * * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); CURVNET_SET(tp->t_vnet); tp = tcp_drop(tp, ECONNABORTED); CURVNET_RESTORE(); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; /* Can't get to the inp, but it should be locked. */ /* INP_LOCK_ASSERT(inp); */ MPASS(tls->mode == TCP_TLS_MODE_IFNET); if (tls->snd_tag == NULL) { /* * Resetting send tag, ignore this change. The * pending reset may or may not see this updated rate * in the tcpcb. If it doesn't, we will just lose * this rate change. */ return (0); } MPASS(tls->snd_tag != NULL); MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT); mst = tls->snd_tag; return (mst->sw->snd_tag_modify(mst, ¶ms)); } #endif #endif void ktls_destroy(struct ktls_session *tls) { if (tls->sequential_records) { struct mbuf *m, *n; int page_count; STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) { page_count = m->m_epg_enc_cnt; while (page_count > 0) { KASSERT(page_count >= m->m_epg_nrdy, ("%s: too few pages", __func__)); page_count -= m->m_epg_nrdy; m = m_free(m); } } } ktls_cleanup(tls); uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_EXTPG) != 0, ("ktls_seq: mapped mbuf %p", m)); m->m_epg_seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. To handle the * special case of empty fragments for TLS 1.0 sessions, an empty * fragment counts as one page. */ void ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; uint64_t *noncep; uint16_t tls_len; int maxlen __diagused; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be TLS records whose * payload does not exceed the maximum frame length. * - * Empty TLS records are permitted when using CBC. + * Empty TLS 1.0 records are permitted when using CBC. */ - KASSERT(m->m_len <= maxlen && - (tls->params.cipher_algorithm == CRYPTO_AES_CBC ? - m->m_len >= 0 : m->m_len > 0), - ("ktls_frame: m %p len %d\n", m, m->m_len)); + KASSERT(m->m_len <= maxlen && m->m_len >= 0 && + (m->m_len > 0 || ktls_permit_empty_frames(tls)), + ("ktls_frame: m %p len %d", m, m->m_len)); /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_EXTPG) != 0, - ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); + ("ktls_frame: mapped mbuf %p (top = %p)", m, top)); tls_len = m->m_len; /* Save a reference to the session. */ m->m_epg_tls = ktls_hold(tls); m->m_epg_hdrlen = tls->params.tls_hlen; m->m_epg_trllen = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); m->m_epg_trllen -= delta; } m->m_len += m->m_epg_hdrlen + m->m_epg_trllen; /* Populate the TLS header. */ tlshdr = (void *)m->m_epg_hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; /* * TLS 1.3 masquarades as TLS 1.2 with a record type * of TLS_RLTYPE_APP. */ if (tls->params.tls_vminor == TLS_MINOR_VER_THREE && tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) { tlshdr->tls_vminor = TLS_MINOR_VER_TWO; tlshdr->tls_type = TLS_RLTYPE_APP; /* save the real record type for later */ m->m_epg_record_type = record_type; m->m_epg_trail[0] = record_type; } else { tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; } tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * Store nonces / explicit IVs after the end of the * TLS header. * * For GCM with TLS 1.2, an 8 byte nonce is copied * from the end of the IV. The nonce is then * incremented for use by the next record. * * For CBC, a random nonce is inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 && tls->params.tls_vminor == TLS_MINOR_VER_TWO) { noncep = (uint64_t *)(tls->params.iv + 8); be64enc(tlshdr + 1, *noncep); (*noncep)++; } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->mode == TCP_TLS_MODE_SW) { m->m_flags |= M_NOTREADY; if (__predict_false(tls_len == 0)) { /* TLS 1.0 empty fragment. */ m->m_epg_nrdy = 1; } else m->m_epg_nrdy = m->m_epg_npgs; *enq_cnt += m->m_epg_nrdy; } } } +bool +ktls_permit_empty_frames(struct ktls_session *tls) +{ + return (tls->params.cipher_algorithm == CRYPTO_AES_CBC && + tls->params.tls_vminor == TLS_MINOR_VER_ZERO); +} + void ktls_check_rx(struct sockbuf *sb) { struct tls_record_layer hdr; struct ktls_wq *wq; struct socket *so; bool running; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", __func__, sb)); so = __containerof(sb, struct socket, so_rcv); if (sb->sb_flags & SB_TLS_RX_RUNNING) return; /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < sizeof(hdr)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) so->so_error = EMSGSIZE; return; } m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); /* Is the entire record queued? */ if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { if ((sb->sb_state & SBS_CANTRCVMORE) != 0) so->so_error = EMSGSIZE; return; } sb->sb_flags |= SB_TLS_RX_RUNNING; soref(so); wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_rx_queued, 1); } static struct mbuf * ktls_detach_record(struct sockbuf *sb, int len) { struct mbuf *m, *n, *top; int remain; SOCKBUF_LOCK_ASSERT(sb); MPASS(len <= sb->sb_tlscc); /* * If TLS chain is the exact size of the record, * just grab the whole record. */ top = sb->sb_mtls; if (sb->sb_tlscc == len) { sb->sb_mtls = NULL; sb->sb_mtlstail = NULL; goto out; } /* * While it would be nice to use m_split() here, we need * to know exactly what m_split() allocates to update the * accounting, so do it inline instead. */ remain = len; for (m = top; remain > m->m_len; m = m->m_next) remain -= m->m_len; /* Easy case: don't have to split 'm'. */ if (remain == m->m_len) { sb->sb_mtls = m->m_next; if (sb->sb_mtls == NULL) sb->sb_mtlstail = NULL; m->m_next = NULL; goto out; } /* * Need to allocate an mbuf to hold the remainder of 'm'. Try * with M_NOWAIT first. */ n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { /* * Use M_WAITOK with socket buffer unlocked. If * 'sb_mtls' changes while the lock is dropped, return * NULL to force the caller to retry. */ SOCKBUF_UNLOCK(sb); n = m_get(M_WAITOK, MT_DATA); SOCKBUF_LOCK(sb); if (sb->sb_mtls != top) { m_free(n); return (NULL); } } n->m_flags |= M_NOTREADY; /* Store remainder in 'n'. */ n->m_len = m->m_len - remain; if (m->m_flags & M_EXT) { n->m_data = m->m_data + remain; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); } /* Trim 'm' and update accounting. */ m->m_len -= n->m_len; sb->sb_tlscc -= n->m_len; sb->sb_ccc -= n->m_len; /* Account for 'n'. */ sballoc_ktls_rx(sb, n); /* Insert 'n' into the TLS chain. */ sb->sb_mtls = n; n->m_next = m->m_next; if (sb->sb_mtlstail == m) sb->sb_mtlstail = n; /* Detach the record from the TLS chain. */ m->m_next = NULL; out: MPASS(m_length(top, NULL) == len); for (m = top; m != NULL; m = m->m_next) sbfree_ktls_rx(sb, m); sb->sb_tlsdcc = len; sb->sb_ccc += len; SBCHECK(sb); return (top); } /* * Determine the length of the trailing zero padding and find the real * record type in the byte before the padding. * * Walking the mbuf chain backwards is clumsy, so another option would * be to scan forwards remembering the last non-zero byte before the * trailer. However, it would be expensive to scan the entire record. * Instead, find the last non-zero byte of each mbuf in the chain * keeping track of the relative offset of that nonzero byte. * * trail_len is the size of the MAC/tag on input and is set to the * size of the full trailer including padding and the record type on * return. */ static int tls13_find_record_type(struct ktls_session *tls, struct mbuf *m, int tls_len, int *trailer_len, uint8_t *record_typep) { char *cp; u_int digest_start, last_offset, m_len, offset; uint8_t record_type; digest_start = tls_len - *trailer_len; last_offset = 0; offset = 0; for (; m != NULL && offset < digest_start; offset += m->m_len, m = m->m_next) { /* Don't look for padding in the tag. */ m_len = min(digest_start - offset, m->m_len); cp = mtod(m, char *); /* Find last non-zero byte in this mbuf. */ while (m_len > 0 && cp[m_len - 1] == 0) m_len--; if (m_len > 0) { record_type = cp[m_len - 1]; last_offset = offset + m_len; } } if (last_offset < tls->params.tls_hlen) return (EBADMSG); *record_typep = record_type; *trailer_len = tls_len - last_offset + 1; return (0); } static void ktls_decrypt(struct socket *so) { char tls_header[MBUF_PEXT_HDR_LEN]; struct ktls_session *tls; struct sockbuf *sb; struct tls_record_layer *hdr; struct tls_get_record tgr; struct mbuf *control, *data, *m; uint64_t seqno; int error, remain, tls_len, trail_len; bool tls13; uint8_t vminor, record_type; hdr = (struct tls_record_layer *)tls_header; sb = &so->so_rcv; SOCKBUF_LOCK(sb); KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, ("%s: socket %p not running", __func__, so)); tls = sb->sb_tls_info; MPASS(tls != NULL); tls13 = (tls->params.tls_vminor == TLS_MINOR_VER_THREE); if (tls13) vminor = TLS_MINOR_VER_TWO; else vminor = tls->params.tls_vminor; for (;;) { /* Is there enough queued for a TLS header? */ if (sb->sb_tlscc < tls->params.tls_hlen) break; m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); if (hdr->tls_vmajor != tls->params.tls_vmajor || hdr->tls_vminor != vminor) error = EINVAL; else if (tls13 && hdr->tls_type != TLS_RLTYPE_APP) error = EINVAL; else if (tls_len < tls->params.tls_hlen || tls_len > tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + tls->params.tls_tlen) error = EMSGSIZE; else error = 0; if (__predict_false(error != 0)) { /* * We have a corrupted record and are likely * out of sync. The connection isn't * recoverable at this point, so abort it. */ SOCKBUF_UNLOCK(sb); counter_u64_add(ktls_offload_corrupted_records, 1); CURVNET_SET(so->so_vnet); so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = error; CURVNET_RESTORE(); goto deref; } /* Is the entire record queued? */ if (sb->sb_tlscc < tls_len) break; /* * Split out the portion of the mbuf chain containing * this TLS record. */ data = ktls_detach_record(sb, tls_len); if (data == NULL) continue; MPASS(sb->sb_tlsdcc == tls_len); seqno = sb->sb_tls_seqno; sb->sb_tls_seqno++; SBCHECK(sb); SOCKBUF_UNLOCK(sb); error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len); if (error == 0) { if (tls13) error = tls13_find_record_type(tls, data, tls_len, &trail_len, &record_type); else record_type = hdr->tls_type; } if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* * sbcut/drop/flush discarded these * mbufs. */ m_freem(data); break; } /* * Drop this TLS record's data, but keep * decrypting subsequent records. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; CURVNET_SET(so->so_vnet); so->so_error = EBADMSG; sorwakeup_locked(so); CURVNET_RESTORE(); m_freem(data); SOCKBUF_LOCK(sb); continue; } /* Allocate the control mbuf. */ memset(&tgr, 0, sizeof(tgr)); tgr.tls_type = record_type; tgr.tls_vmajor = hdr->tls_vmajor; tgr.tls_vminor = hdr->tls_vminor; tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - trail_len); control = sbcreatecontrol_how(&tgr, sizeof(tgr), TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); SOCKBUF_LOCK(sb); if (sb->sb_tlsdcc == 0) { /* sbcut/drop/flush discarded these mbufs. */ MPASS(sb->sb_tlscc == 0); m_freem(data); m_freem(control); break; } /* * Clear the 'dcc' accounting in preparation for * adding the decrypted record. */ sb->sb_ccc -= tls_len; sb->sb_tlsdcc = 0; SBCHECK(sb); /* If there is no payload, drop all of the data. */ if (tgr.tls_length == htobe16(0)) { m_freem(data); data = NULL; } else { /* Trim header. */ remain = tls->params.tls_hlen; while (remain > 0) { if (data->m_len > remain) { data->m_data += remain; data->m_len -= remain; break; } remain -= data->m_len; data = m_free(data); } /* Trim trailer and clear M_NOTREADY. */ remain = be16toh(tgr.tls_length); m = data; for (m = data; remain > m->m_len; m = m->m_next) { m->m_flags &= ~M_NOTREADY; remain -= m->m_len; } m->m_len = remain; m_freem(m->m_next); m->m_next = NULL; m->m_flags &= ~M_NOTREADY; /* Set EOR on the final mbuf. */ m->m_flags |= M_EOR; } sbappendcontrol_locked(sb, data, control, 0); } sb->sb_flags &= ~SB_TLS_RX_RUNNING; if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0) so->so_error = EMSGSIZE; sorwakeup_locked(so); deref: SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); } void ktls_enqueue_to_free(struct mbuf *m) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ m->m_epg_flags |= EPG_FLAG_2FREE; wq = &ktls_wq[m->m_epg_tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } static void * ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m) { void *buf; int domain, running; if (m->m_epg_npgs <= 2) return (NULL); if (ktls_buffer_zone == NULL) return (NULL); if ((u_int)(ticks - wq->lastallocfail) < hz) { /* * Rate-limit allocation attempts after a failure. * ktls_buffer_import() will acquire a per-domain mutex to check * the free page queues and may fail consistently if memory is * fragmented. */ return (NULL); } buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM); if (buf == NULL) { domain = PCPU_GET(domain); wq->lastallocfail = ticks; /* * Note that this check is "racy", but the races are * harmless, and are either a spurious wakeup if * multiple threads fail allocations before the alloc * thread wakes, or waiting an extra second in case we * see an old value of running == true. */ if (!VM_DOMAIN_EMPTY(domain)) { running = atomic_load_int(&ktls_domains[domain].alloc_td.running); if (!running) wakeup(&ktls_domains[domain].alloc_td); } } return (buf); } static int ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m, struct ktls_session *tls, struct ktls_ocf_encrypt_state *state) { vm_page_t pg; int error, i, len, off; KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY), ("%p not unready & nomap mbuf\n", m)); KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen, ("page count %d larger than maximum frame length %d", m->m_epg_npgs, ktls_maxlen)); /* Anonymous mbufs are encrypted in place. */ if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) return (tls->sw_encrypt(state, tls, m, NULL, 0)); /* * For file-backed mbufs (from sendfile), anonymous wired * pages are allocated and used as the encryption destination. */ if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) { len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len - m->m_epg_1st_off; state->dst_iov[0].iov_base = (char *)state->cbuf + m->m_epg_1st_off; state->dst_iov[0].iov_len = len; state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf); i = 1; } else { off = m->m_epg_1st_off; for (i = 0; i < m->m_epg_npgs; i++, off = 0) { pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); len = m_epg_pagelen(m, i, off); state->parray[i] = VM_PAGE_TO_PHYS(pg); state->dst_iov[i].iov_base = (char *)PHYS_TO_DMAP(state->parray[i]) + off; state->dst_iov[i].iov_len = len; } } KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small")); state->dst_iov[i].iov_base = m->m_epg_trail; state->dst_iov[i].iov_len = m->m_epg_trllen; error = tls->sw_encrypt(state, tls, m, state->dst_iov, i + 1); if (__predict_false(error != 0)) { /* Free the anonymous pages. */ if (state->cbuf != NULL) uma_zfree(ktls_buffer_zone, state->cbuf); else { for (i = 0; i < m->m_epg_npgs; i++) { pg = PHYS_TO_VM_PAGE(state->parray[i]); (void)vm_page_unwire_noq(pg); vm_page_free(pg); } } } return (error); } /* Number of TLS records in a batch passed to ktls_enqueue(). */ static u_int ktls_batched_records(struct mbuf *m) { int page_count, records; records = 0; page_count = m->m_epg_enc_cnt; while (page_count > 0) { records++; page_count -= m->m_epg_nrdy; m = m->m_next; } KASSERT(page_count == 0, ("%s: mismatched page count", __func__)); return (records); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct ktls_session *tls; struct ktls_wq *wq; int queued; bool running; KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf")); m->m_epg_enc_cnt = page_count; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). */ m->m_epg_so = so; queued = 1; tls = m->m_epg_tls; wq = &ktls_wq[tls->wq_index]; mtx_lock(&wq->mtx); if (__predict_false(tls->sequential_records)) { /* * For TLS 1.0, records must be encrypted * sequentially. For a given connection, all records * queued to the associated work queue are processed * sequentially. However, sendfile(2) might complete * I/O requests spanning multiple TLS records out of * order. Here we ensure TLS records are enqueued to * the work queue in FIFO order. * * tls->next_seqno holds the sequence number of the * next TLS record that should be enqueued to the work * queue. If this next record is not tls->next_seqno, * it must be a future record, so insert it, sorted by * TLS sequence number, into tls->pending_records and * return. * * If this TLS record matches tls->next_seqno, place * it in the work queue and then check * tls->pending_records to see if any * previously-queued records are now ready for * encryption. */ if (m->m_epg_seqno != tls->next_seqno) { struct mbuf *n, *p; p = NULL; STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) { if (n->m_epg_seqno > m->m_epg_seqno) break; p = n; } if (n == NULL) STAILQ_INSERT_TAIL(&tls->pending_records, m, m_epg_stailq); else if (p == NULL) STAILQ_INSERT_HEAD(&tls->pending_records, m, m_epg_stailq); else STAILQ_INSERT_AFTER(&tls->pending_records, p, m, m_epg_stailq); mtx_unlock(&wq->mtx); counter_u64_add(ktls_cnt_tx_pending, 1); return; } tls->next_seqno += ktls_batched_records(m); STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); while (!STAILQ_EMPTY(&tls->pending_records)) { struct mbuf *n; n = STAILQ_FIRST(&tls->pending_records); if (n->m_epg_seqno != tls->next_seqno) break; queued++; STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq); tls->next_seqno += ktls_batched_records(n); STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq); } counter_u64_add(ktls_cnt_tx_pending, -(queued - 1)); } else STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_tx_queued, queued); } /* * Once a file-backed mbuf (from sendfile) has been encrypted, free * the pages from the file and replace them with the anonymous pages * allocated in ktls_encrypt_record(). */ static void ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state) { int i; MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0); /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ if (state->cbuf != NULL) { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[0] + ptoa(i); /* Contig pages should go back to the cache. */ m->m_ext.ext_free = ktls_free_mext_contig; } else { for (i = 0; i < m->m_epg_npgs; i++) m->m_epg_pa[i] = state->parray[i]; /* Use the basic free routine. */ m->m_ext.ext_free = mb_free_mext_pgs; } /* Pages are now writable. */ m->m_epg_flags |= EPG_FLAG_ANON; } static __noinline void ktls_encrypt(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state state; struct ktls_session *tls; struct socket *so; struct mbuf *m; int error, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); error = ktls_encrypt_record(wq, m, tls, &state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, &state); npages += m->m_epg_nrdy; /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ m->m_epg_tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } sorele(so); CURVNET_RESTORE(); } void ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error) { struct ktls_session *tls; struct socket *so; struct mbuf *m; int npages; m = state->m; if ((m->m_epg_flags & EPG_FLAG_ANON) == 0) ktls_finish_nonanon(m, state); so = state->so; free(state, M_KTLS); /* * Drop a reference to the session now that it is no longer * needed. Existing code depends on encrypted records having * no associated session vs yet-to-be-encrypted records having * an associated session. */ tls = m->m_epg_tls; m->m_epg_tls = NULL; ktls_free(tls); if (error != 0) counter_u64_add(ktls_offload_failed_crypto, 1); CURVNET_SET(so->so_vnet); npages = m->m_epg_nrdy; if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, m, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(m, npages); } sorele(so); CURVNET_RESTORE(); } /* * Similar to ktls_encrypt, but used with asynchronous OCF backends * (coprocessors) where encryption does not use host CPU resources and * it can be beneficial to queue more requests than CPUs. */ static __noinline void ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top) { struct ktls_ocf_encrypt_state *state; struct ktls_session *tls; struct socket *so; struct mbuf *m, *n; int error, mpages, npages, total_pages; so = top->m_epg_so; tls = top->m_epg_tls; KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top)); KASSERT(so != NULL, ("so = NULL, top = %p\n", top)); #ifdef INVARIANTS top->m_epg_so = NULL; #endif total_pages = top->m_epg_enc_cnt; npages = 0; error = 0; for (m = top; npages != total_pages; m = n) { KASSERT(m->m_epg_tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, m->m_epg_tls)); KASSERT(npages + m->m_epg_npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO); soref(so); state->so = so; state->m = m; mpages = m->m_epg_nrdy; n = m->m_next; error = ktls_encrypt_record(wq, m, tls, state); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); free(state, M_KTLS); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); break; } npages += mpages; } CURVNET_SET(so->so_vnet); if (error != 0) { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(m, total_pages - npages); } sorele(so); CURVNET_RESTORE(); } static int ktls_bind_domain(int domain) { int error; error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]); if (error != 0) return (error); curthread->td_domain.dr_policy = DOMAINSET_PREF(domain); return (0); } static void ktls_alloc_thread(void *ctx) { struct ktls_domain_info *ktls_domain = ctx; struct ktls_alloc_thread *sc = &ktls_domain->alloc_td; void **buf; struct sysctl_oid *oid; char name[80]; int domain, error, i, nbufs; domain = ktls_domain - ktls_domains; if (bootverbose) printf("Starting KTLS alloc thread for domain %d\n", domain); error = ktls_bind_domain(domain); if (error) printf("Unable to bind KTLS alloc thread for domain %d: error %d\n", domain, error); snprintf(name, sizeof(name), "domain%d", domain); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD, &sc->allocs, 0, "buffers allocated"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups", CTLFLAG_RD, &sc->wakeups, 0, "thread wakeups"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running", CTLFLAG_RD, &sc->running, 0, "thread running"); buf = NULL; nbufs = 0; for (;;) { atomic_store_int(&sc->running, 0); tsleep(sc, PZERO | PNOLOCK, "-", 0); atomic_store_int(&sc->running, 1); sc->wakeups++; if (nbufs != ktls_max_alloc) { free(buf, M_KTLS); nbufs = atomic_load_int(&ktls_max_alloc); buf = malloc(sizeof(void *) * nbufs, M_KTLS, M_WAITOK | M_ZERO); } /* * Below we allocate nbufs with different allocation * flags than we use when allocating normally during * encryption in the ktls worker thread. We specify * M_NORECLAIM in the worker thread. However, we omit * that flag here and add M_WAITOK so that the VM * system is permitted to perform expensive work to * defragment memory. We do this here, as it does not * matter if this thread blocks. If we block a ktls * worker thread, we risk developing backlogs of * buffers to be encrypted, leading to surges of * traffic and potential NIC output drops. */ for (i = 0; i < nbufs; i++) { buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK); sc->allocs++; } for (i = 0; i < nbufs; i++) { uma_zfree(ktls_buffer_zone, buf[i]); buf[i] = NULL; } } } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf *m, *n; struct socket *so, *son; STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; int cpu; cpu = wq - ktls_wq; if (bootverbose) printf("Starting KTLS worker thread for CPU %d\n", cpu); /* * Bind to a core. If ktls_bind_threads is > 1, then * we bind to the NUMA domain instead. */ if (ktls_bind_threads) { int error; if (ktls_bind_threads > 1) { struct pcpu *pc = pcpu_find(cpu); error = ktls_bind_domain(pc->pc_domain); } else { cpuset_t mask; CPU_SETOF(cpu, &mask); error = cpuset_setthread(curthread->td_tid, &mask); } if (error) printf("Unable to bind KTLS worker thread for CPU %d: error %d\n", cpu, error); } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->m_head) && STAILQ_EMPTY(&wq->so_head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_m_head); STAILQ_CONCAT(&local_m_head, &wq->m_head); STAILQ_INIT(&local_so_head); STAILQ_CONCAT(&local_so_head, &wq->so_head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) { if (m->m_epg_flags & EPG_FLAG_2FREE) { ktls_free(m->m_epg_tls); m_free_raw(m); } else { if (m->m_epg_tls->sync_dispatch) ktls_encrypt(wq, m); else ktls_encrypt_async(wq, m); counter_u64_add(ktls_cnt_tx_queued, -1); } } STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) { ktls_decrypt(so); counter_u64_add(ktls_cnt_rx_queued, -1); } } } #if defined(INET) || defined(INET6) static void ktls_disable_ifnet_help(void *context, int pending __unused) { struct ktls_session *tls; struct inpcb *inp; struct tcpcb *tp; struct socket *so; int err; tls = context; inp = tls->inp; if (inp == NULL) return; INP_WLOCK(inp); so = inp->inp_socket; MPASS(so != NULL); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { goto out; } if (so->so_snd.sb_tls_info != NULL) err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW); else err = ENXIO; if (err == 0) { counter_u64_add(ktls_ifnet_disable_ok, 1); /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && (tp = intotcpcb(inp)) != NULL && tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 0); } else { counter_u64_add(ktls_ifnet_disable_fail, 1); } out: sorele(so); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); ktls_free(tls); } /* * Called when re-transmits are becoming a substantial portion of the * sends on this connection. When this happens, we transition the * connection to software TLS. This is needed because most inline TLS * NICs keep crypto state only for in-order transmits. This means * that to handle a TCP rexmit (which is out-of-order), the NIC must * re-DMA the entire TLS record up to and including the current * segment. This means that when re-transmitting the last ~1448 byte * segment of a 16KB TLS record, we could wind up re-DMA'ing an order * of magnitude more data than we are sending. This can cause the * PCIe link to saturate well before the network, which can cause * output drops, and a general loss of capacity. */ void ktls_disable_ifnet(void *arg) { struct tcpcb *tp; struct inpcb *inp; struct socket *so; struct ktls_session *tls; tp = arg; inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); so = inp->inp_socket; SOCK_LOCK(so); tls = so->so_snd.sb_tls_info; if (tls->disable_ifnet_pending) { SOCK_UNLOCK(so); return; } /* * note that disable_ifnet_pending is never cleared; disabling * ifnet can only be done once per session, so we never want * to do it again */ (void)ktls_hold(tls); in_pcbref(inp); soref(so); tls->disable_ifnet_pending = true; tls->inp = inp; SOCK_UNLOCK(so); TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); } #endif diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 33dfe6cb2176..ab8e5d6e1b69 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4529 +1,4534 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) return (ECAPMODE); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); log(LOG_DEBUG, "%s: pcb %p (%s): Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(&descrsb), qlen, overcount); sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. */ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue(). */ if ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { solisten_proto_abort(so); return (EINVAL); } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; bool last __diagused; SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_NOFDREF | SS_PROTOREF)) != SS_NOFDREF || refcount_load(&so->so_count) != 0 || so->so_qstate == SQ_COMP) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we don't have SS_NOFDREF neither SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After relock the socket may not move to so_comp since it * doesn't have PCB already, but it may be removed from * so_incomp. If that happens, we share responsiblity on * freeing the socket, but soclose() has already removed * it from queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; last = refcount_release(&sol->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, sol)); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele_locked(sol); KASSERT(refcount_load(&so->so_count) == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (refcount_load(&sp->so_count) == 0) { SOCK_UNLOCK(sp); soabort(sp); } else { /* See the handling of queued sockets in sofree(). */ SOCK_UNLOCK(sp); } } } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } + + if (resid == 0 && !ktls_permit_empty_frames(tls)) { + error = EINVAL; + goto release; + } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for a non-application data * record. If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* This will need to change for TLS 1.3. */ if (tgr.tls_type != TLS_RLTYPE_APP) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; SOCK_LOCK(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { SOCK_UNLOCK(so); return (ENOTCONN); } soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } else { SOCK_UNLOCK(so); } goto done; } SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); pr = so->so_proto; if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct socket aso; struct protosw *pr; int error; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, we make a partial copy of the socket buffer and clear the * original. The new socket buffer copy won't have initialized locks so * we can only call routines that won't use or assert those locks. * Ideally calling socantrcvmore() would prevent data from being added * to the buffer, but currently it merely prevents buffered data from * being read by userspace. We make this effort to free buffered data * nonetheless. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } SOCK_RECVBUF_LOCK(so); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&so->so_rcv.sb_startzero, &aso.so_rcv.sb_startzero, offsetof(struct sockbuf, sb_endzero) - offsetof(struct sockbuf, sb_startzero)); bzero(&so->so_rcv.sb_startzero, offsetof(struct sockbuf, sb_endzero) - offsetof(struct sockbuf, sb_startzero)); SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ pr = so->so_proto; if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { if (control != NULL) m_freem(control); if ((flags & PRUS_NOTREADY) == 0) m_freem(m); return (EOPNOTSUPP); } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); goto again; } SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h index 5cfca2d860a0..4fa52f13e127 100644 --- a/sys/sys/ktls.h +++ b/sys/sys/ktls.h @@ -1,247 +1,248 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_KTLS_H_ #define _SYS_KTLS_H_ #ifdef _KERNEL #include #include #endif struct tls_record_layer { uint8_t tls_type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; uint8_t tls_data[0]; } __attribute__ ((packed)); #define TLS_MAX_MSG_SIZE_V10_2 16384 #define TLS_MAX_PARAM_SIZE 1024 /* Max key/mac/iv in sockopt */ #define TLS_AEAD_GCM_LEN 4 #define TLS_1_3_GCM_IV_LEN 12 #define TLS_CHACHA20_IV_LEN 12 #define TLS_CBC_IMPLICIT_IV_LEN 16 /* Type values for the record layer */ #define TLS_RLTYPE_APP 23 /* * Nonce for GCM for TLS 1.2 per RFC 5288. */ struct tls_nonce_data { uint8_t fixed[TLS_AEAD_GCM_LEN]; uint64_t seq; } __packed; /* * AEAD additional data format for TLS 1.2 per RFC 5246. */ struct tls_aead_data { uint64_t seq; /* In network order */ uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; /* * AEAD additional data format for TLS 1.3 per RFC 8446. */ struct tls_aead_data_13 { uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; /* * Stream Cipher MAC additional data input. This does not match the * exact data on the wire (the sequence number is not placed on the * wire, and any explicit IV after the record header is not covered by * the MAC). */ struct tls_mac_data { uint64_t seq; uint8_t type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; } __packed; #define TLS_MAJOR_VER_ONE 3 #define TLS_MINOR_VER_ZERO 1 /* 3, 1 */ #define TLS_MINOR_VER_ONE 2 /* 3, 2 */ #define TLS_MINOR_VER_TWO 3 /* 3, 3 */ #define TLS_MINOR_VER_THREE 4 /* 3, 4 */ /* For TCP_TXTLS_ENABLE and TCP_RXTLS_ENABLE. */ #ifdef _KERNEL struct tls_enable_v0 { const uint8_t *cipher_key; const uint8_t *iv; /* Implicit IV. */ const uint8_t *auth_key; int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ int cipher_key_len; int iv_len; int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ int auth_key_len; int flags; uint8_t tls_vmajor; uint8_t tls_vminor; }; #endif struct tls_enable { const uint8_t *cipher_key; const uint8_t *iv; /* Implicit IV. */ const uint8_t *auth_key; int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ int cipher_key_len; int iv_len; int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ int auth_key_len; int flags; uint8_t tls_vmajor; uint8_t tls_vminor; uint8_t rec_seq[8]; }; /* Structure for TLS_GET_RECORD. */ struct tls_get_record { /* TLS record header. */ uint8_t tls_type; uint8_t tls_vmajor; uint8_t tls_vminor; uint16_t tls_length; }; #ifdef _KERNEL struct tls_session_params { uint8_t *cipher_key; uint8_t *auth_key; uint8_t iv[TLS_CBC_IMPLICIT_IV_LEN]; int cipher_algorithm; int auth_algorithm; uint16_t cipher_key_len; uint16_t iv_len; uint16_t auth_key_len; uint16_t max_frame_len; uint8_t tls_vmajor; uint8_t tls_vminor; uint8_t tls_hlen; uint8_t tls_tlen; uint8_t tls_bs; uint8_t flags; }; /* Used in APIs to request RX vs TX sessions. */ #define KTLS_TX 1 #define KTLS_RX 2 struct iovec; struct ktls_ocf_session; struct ktls_ocf_encrypt_state; struct ktls_session; struct m_snd_tag; struct mbuf; struct sockbuf; struct socket; struct ktls_session { union { int (*sw_encrypt)(struct ktls_ocf_encrypt_state *state, struct ktls_session *tls, struct mbuf *m, struct iovec *outiov, int outiovcnt); int (*sw_decrypt)(struct ktls_session *tls, const struct tls_record_layer *hdr, struct mbuf *m, uint64_t seqno, int *trailer_len); }; struct ktls_ocf_session *ocf_session; struct m_snd_tag *snd_tag; struct tls_session_params params; u_int wq_index; volatile u_int refcount; int mode; struct task reset_tag_task; struct task disable_ifnet_task; struct inpcb *inp; bool reset_pending; bool disable_ifnet_pending; bool sync_dispatch; bool sequential_records; /* Only used for TLS 1.0. */ uint64_t next_seqno; STAILQ_HEAD(, mbuf) pending_records; } __aligned(CACHE_LINE_SIZE); extern unsigned int ktls_ifnet_max_rexmit_pct; void ktls_check_rx(struct sockbuf *sb); void ktls_disable_ifnet(void *arg); int ktls_enable_rx(struct socket *so, struct tls_enable *en); int ktls_enable_tx(struct socket *so, struct tls_enable *en); void ktls_destroy(struct ktls_session *tls); void ktls_frame(struct mbuf *m, struct ktls_session *tls, int *enqueue_cnt, uint8_t record_type); +bool ktls_permit_empty_frames(struct ktls_session *tls); void ktls_seq(struct sockbuf *sb, struct mbuf *m); void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count); void ktls_enqueue_to_free(struct mbuf *m); int ktls_get_rx_mode(struct socket *so, int *modep); int ktls_set_tx_mode(struct socket *so, int mode); int ktls_get_tx_mode(struct socket *so, int *modep); int ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq); int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls); #ifdef RATELIMIT int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate); #endif bool ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp); static inline struct ktls_session * ktls_hold(struct ktls_session *tls) { if (tls != NULL) refcount_acquire(&tls->refcount); return (tls); } static inline void ktls_free(struct ktls_session *tls) { if (refcount_release(&tls->refcount)) ktls_destroy(tls); } #endif /* !_KERNEL */ #endif /* !_SYS_KTLS_H_ */ diff --git a/tests/sys/kern/ktls_test.c b/tests/sys/kern/ktls_test.c index 9525258a64bc..759a23455f25 100644 --- a/tests/sys/kern/ktls_test.c +++ b/tests/sys/kern/ktls_test.c @@ -1,1785 +1,1800 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Netflix Inc. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void require_ktls(void) { size_t len; bool enable; len = sizeof(enable); if (sysctlbyname("kern.ipc.tls.enable", &enable, &len, NULL, 0) == -1) { if (errno == ENOENT) atf_tc_skip("kernel does not support TLS offload"); atf_libc_error(errno, "Failed to read kern.ipc.tls.enable"); } if (!enable) atf_tc_skip("Kernel TLS is disabled"); } #define ATF_REQUIRE_KTLS() require_ktls() static char rdigit(void) { /* ASCII printable values between 0x20 and 0x7e */ return (0x20 + random() % (0x7f - 0x20)); } static char * alloc_buffer(size_t len) { char *buf; size_t i; if (len == 0) return (NULL); buf = malloc(len); for (i = 0; i < len; i++) buf[i] = rdigit(); return (buf); } static bool socketpair_tcp(int *sv) { struct pollfd pfd; struct sockaddr_in sin; socklen_t len; int as, cs, ls; ls = socket(PF_INET, SOCK_STREAM, 0); if (ls == -1) { warn("socket() for listen"); return (false); } memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); if (bind(ls, (struct sockaddr *)&sin, sizeof(sin)) == -1) { warn("bind"); close(ls); return (false); } if (listen(ls, 1) == -1) { warn("listen"); close(ls); return (false); } len = sizeof(sin); if (getsockname(ls, (struct sockaddr *)&sin, &len) == -1) { warn("getsockname"); close(ls); return (false); } cs = socket(PF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); if (cs == -1) { warn("socket() for connect"); close(ls); return (false); } if (connect(cs, (struct sockaddr *)&sin, sizeof(sin)) == -1) { if (errno != EINPROGRESS) { warn("connect"); close(ls); close(cs); return (false); } } as = accept4(ls, NULL, NULL, SOCK_NONBLOCK); if (as == -1) { warn("accept4"); close(ls); close(cs); return (false); } close(ls); pfd.fd = cs; pfd.events = POLLOUT; pfd.revents = 0; ATF_REQUIRE(poll(&pfd, 1, INFTIM) == 1); ATF_REQUIRE(pfd.revents == POLLOUT); sv[0] = cs; sv[1] = as; return (true); } static void fd_set_blocking(int fd) { int flags; ATF_REQUIRE((flags = fcntl(fd, F_GETFL)) != -1); flags &= ~O_NONBLOCK; ATF_REQUIRE(fcntl(fd, F_SETFL, flags) != -1); } static bool cbc_decrypt(const EVP_CIPHER *cipher, const char *key, const char *iv, const char *input, char *output, size_t size) { EVP_CIPHER_CTX *ctx; int outl, total; ctx = EVP_CIPHER_CTX_new(); if (ctx == NULL) { warnx("EVP_CIPHER_CTX_new failed: %s", ERR_error_string(ERR_get_error(), NULL)); return (false); } if (EVP_CipherInit_ex(ctx, cipher, NULL, (const u_char *)key, (const u_char *)iv, 0) != 1) { warnx("EVP_CipherInit_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } EVP_CIPHER_CTX_set_padding(ctx, 0); if (EVP_CipherUpdate(ctx, (u_char *)output, &outl, (const u_char *)input, size) != 1) { warnx("EVP_CipherUpdate failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } total = outl; if (EVP_CipherFinal_ex(ctx, (u_char *)output + outl, &outl) != 1) { warnx("EVP_CipherFinal_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } total += outl; if ((size_t)total != size) { warnx("decrypt size mismatch: %zu vs %d", size, total); EVP_CIPHER_CTX_free(ctx); return (false); } EVP_CIPHER_CTX_free(ctx); return (true); } static bool verify_hash(const EVP_MD *md, const void *key, size_t key_len, const void *aad, size_t aad_len, const void *buffer, size_t len, const void *digest) { HMAC_CTX *ctx; unsigned char digest2[EVP_MAX_MD_SIZE]; u_int digest_len; ctx = HMAC_CTX_new(); if (ctx == NULL) { warnx("HMAC_CTX_new failed: %s", ERR_error_string(ERR_get_error(), NULL)); return (false); } if (HMAC_Init_ex(ctx, key, key_len, md, NULL) != 1) { warnx("HMAC_Init_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); HMAC_CTX_free(ctx); return (false); } if (HMAC_Update(ctx, aad, aad_len) != 1) { warnx("HMAC_Update (aad) failed: %s", ERR_error_string(ERR_get_error(), NULL)); HMAC_CTX_free(ctx); return (false); } if (HMAC_Update(ctx, buffer, len) != 1) { warnx("HMAC_Update (payload) failed: %s", ERR_error_string(ERR_get_error(), NULL)); HMAC_CTX_free(ctx); return (false); } if (HMAC_Final(ctx, digest2, &digest_len) != 1) { warnx("HMAC_Final failed: %s", ERR_error_string(ERR_get_error(), NULL)); HMAC_CTX_free(ctx); return (false); } HMAC_CTX_free(ctx); if (memcmp(digest, digest2, digest_len) != 0) { warnx("HMAC mismatch"); return (false); } return (true); } static bool aead_encrypt(const EVP_CIPHER *cipher, const char *key, const char *nonce, const void *aad, size_t aad_len, const char *input, char *output, size_t size, char *tag, size_t tag_len) { EVP_CIPHER_CTX *ctx; int outl, total; ctx = EVP_CIPHER_CTX_new(); if (ctx == NULL) { warnx("EVP_CIPHER_CTX_new failed: %s", ERR_error_string(ERR_get_error(), NULL)); return (false); } if (EVP_EncryptInit_ex(ctx, cipher, NULL, (const u_char *)key, (const u_char *)nonce) != 1) { warnx("EVP_EncryptInit_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } EVP_CIPHER_CTX_set_padding(ctx, 0); if (aad != NULL) { if (EVP_EncryptUpdate(ctx, NULL, &outl, (const u_char *)aad, aad_len) != 1) { warnx("EVP_EncryptUpdate for AAD failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } } if (EVP_EncryptUpdate(ctx, (u_char *)output, &outl, (const u_char *)input, size) != 1) { warnx("EVP_EncryptUpdate failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } total = outl; if (EVP_EncryptFinal_ex(ctx, (u_char *)output + outl, &outl) != 1) { warnx("EVP_EncryptFinal_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } total += outl; if ((size_t)total != size) { warnx("encrypt size mismatch: %zu vs %d", size, total); EVP_CIPHER_CTX_free(ctx); return (false); } if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, tag_len, tag) != 1) { warnx("EVP_CIPHER_CTX_ctrl(EVP_CTRL_AEAD_GET_TAG) failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } EVP_CIPHER_CTX_free(ctx); return (true); } static bool aead_decrypt(const EVP_CIPHER *cipher, const char *key, const char *nonce, const void *aad, size_t aad_len, const char *input, char *output, size_t size, const char *tag, size_t tag_len) { EVP_CIPHER_CTX *ctx; int outl, total; bool valid; ctx = EVP_CIPHER_CTX_new(); if (ctx == NULL) { warnx("EVP_CIPHER_CTX_new failed: %s", ERR_error_string(ERR_get_error(), NULL)); return (false); } if (EVP_DecryptInit_ex(ctx, cipher, NULL, (const u_char *)key, (const u_char *)nonce) != 1) { warnx("EVP_DecryptInit_ex failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } EVP_CIPHER_CTX_set_padding(ctx, 0); if (aad != NULL) { if (EVP_DecryptUpdate(ctx, NULL, &outl, (const u_char *)aad, aad_len) != 1) { warnx("EVP_DecryptUpdate for AAD failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } } if (EVP_DecryptUpdate(ctx, (u_char *)output, &outl, (const u_char *)input, size) != 1) { warnx("EVP_DecryptUpdate failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } total = outl; if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, tag_len, __DECONST(char *, tag)) != 1) { warnx("EVP_CIPHER_CTX_ctrl(EVP_CTRL_AEAD_SET_TAG) failed: %s", ERR_error_string(ERR_get_error(), NULL)); EVP_CIPHER_CTX_free(ctx); return (false); } valid = (EVP_DecryptFinal_ex(ctx, (u_char *)output + outl, &outl) == 1); total += outl; if ((size_t)total != size) { warnx("decrypt size mismatch: %zu vs %d", size, total); EVP_CIPHER_CTX_free(ctx); return (false); } if (!valid) warnx("tag mismatch"); EVP_CIPHER_CTX_free(ctx); return (valid); } static void build_tls_enable(int cipher_alg, size_t cipher_key_len, int auth_alg, int minor, uint64_t seqno, struct tls_enable *en) { u_int auth_key_len, iv_len; memset(en, 0, sizeof(*en)); switch (cipher_alg) { case CRYPTO_AES_CBC: if (minor == TLS_MINOR_VER_ZERO) iv_len = AES_BLOCK_LEN; else iv_len = 0; break; case CRYPTO_AES_NIST_GCM_16: if (minor == TLS_MINOR_VER_TWO) iv_len = TLS_AEAD_GCM_LEN; else iv_len = TLS_1_3_GCM_IV_LEN; break; case CRYPTO_CHACHA20_POLY1305: iv_len = TLS_CHACHA20_IV_LEN; break; default: iv_len = 0; break; } switch (auth_alg) { case CRYPTO_SHA1_HMAC: auth_key_len = SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: auth_key_len = SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: auth_key_len = SHA2_384_HASH_LEN; break; default: auth_key_len = 0; break; } en->cipher_key = alloc_buffer(cipher_key_len); en->iv = alloc_buffer(iv_len); en->auth_key = alloc_buffer(auth_key_len); en->cipher_algorithm = cipher_alg; en->cipher_key_len = cipher_key_len; en->iv_len = iv_len; en->auth_algorithm = auth_alg; en->auth_key_len = auth_key_len; en->tls_vmajor = TLS_MAJOR_VER_ONE; en->tls_vminor = minor; be64enc(en->rec_seq, seqno); } static void free_tls_enable(struct tls_enable *en) { free(__DECONST(void *, en->cipher_key)); free(__DECONST(void *, en->iv)); free(__DECONST(void *, en->auth_key)); } static const EVP_CIPHER * tls_EVP_CIPHER(const struct tls_enable *en) { switch (en->cipher_algorithm) { case CRYPTO_AES_CBC: switch (en->cipher_key_len) { case 128 / 8: return (EVP_aes_128_cbc()); case 256 / 8: return (EVP_aes_256_cbc()); default: return (NULL); } break; case CRYPTO_AES_NIST_GCM_16: switch (en->cipher_key_len) { case 128 / 8: return (EVP_aes_128_gcm()); case 256 / 8: return (EVP_aes_256_gcm()); default: return (NULL); } break; case CRYPTO_CHACHA20_POLY1305: return (EVP_chacha20_poly1305()); default: return (NULL); } } static const EVP_MD * tls_EVP_MD(const struct tls_enable *en) { switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: return (EVP_sha1()); case CRYPTO_SHA2_256_HMAC: return (EVP_sha256()); case CRYPTO_SHA2_384_HMAC: return (EVP_sha384()); default: return (NULL); } } static size_t tls_header_len(struct tls_enable *en) { size_t len; len = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_CBC: if (en->tls_vminor != TLS_MINOR_VER_ZERO) len += AES_BLOCK_LEN; return (len); case CRYPTO_AES_NIST_GCM_16: if (en->tls_vminor == TLS_MINOR_VER_TWO) len += sizeof(uint64_t); return (len); case CRYPTO_CHACHA20_POLY1305: return (len); default: return (0); } } static size_t tls_mac_len(struct tls_enable *en) { switch (en->cipher_algorithm) { case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: return (SHA1_HASH_LEN); case CRYPTO_SHA2_256_HMAC: return (SHA2_256_HASH_LEN); case CRYPTO_SHA2_384_HMAC: return (SHA2_384_HASH_LEN); default: return (0); } case CRYPTO_AES_NIST_GCM_16: return (AES_GMAC_HASH_LEN); case CRYPTO_CHACHA20_POLY1305: return (POLY1305_HASH_LEN); default: return (0); } } /* Includes maximum padding for MTE. */ static size_t tls_trailer_len(struct tls_enable *en) { size_t len; len = tls_mac_len(en); if (en->cipher_algorithm == CRYPTO_AES_CBC) len += AES_BLOCK_LEN; if (en->tls_vminor == TLS_MINOR_VER_THREE) len++; return (len); } /* 'len' is the length of the payload application data. */ static void tls_mte_aad(struct tls_enable *en, size_t len, const struct tls_record_layer *hdr, uint64_t seqno, struct tls_mac_data *ad) { ad->seq = htobe64(seqno); ad->type = hdr->tls_type; ad->tls_vmajor = hdr->tls_vmajor; ad->tls_vminor = hdr->tls_vminor; ad->tls_length = htons(len); } static void tls_12_aead_aad(struct tls_enable *en, size_t len, const struct tls_record_layer *hdr, uint64_t seqno, struct tls_aead_data *ad) { ad->seq = htobe64(seqno); ad->type = hdr->tls_type; ad->tls_vmajor = hdr->tls_vmajor; ad->tls_vminor = hdr->tls_vminor; ad->tls_length = htons(len); } static void tls_13_aad(struct tls_enable *en, const struct tls_record_layer *hdr, uint64_t seqno, struct tls_aead_data_13 *ad) { ad->type = hdr->tls_type; ad->tls_vmajor = hdr->tls_vmajor; ad->tls_vminor = hdr->tls_vminor; ad->tls_length = hdr->tls_length; } static void tls_12_gcm_nonce(struct tls_enable *en, const struct tls_record_layer *hdr, char *nonce) { memcpy(nonce, en->iv, TLS_AEAD_GCM_LEN); memcpy(nonce + TLS_AEAD_GCM_LEN, hdr + 1, sizeof(uint64_t)); } static void tls_13_nonce(struct tls_enable *en, uint64_t seqno, char *nonce) { static_assert(TLS_1_3_GCM_IV_LEN == TLS_CHACHA20_IV_LEN, "TLS 1.3 nonce length mismatch"); memcpy(nonce, en->iv, TLS_1_3_GCM_IV_LEN); *(uint64_t *)(nonce + 4) ^= htobe64(seqno); } /* * Decrypt a TLS record 'len' bytes long at 'src' and store the result at * 'dst'. If the TLS record header length doesn't match or 'dst' doesn't * have sufficient room ('avail'), fail the test. */ static size_t decrypt_tls_aes_cbc_mte(struct tls_enable *en, uint64_t seqno, const void *src, size_t len, void *dst, size_t avail, uint8_t *record_type) { const struct tls_record_layer *hdr; struct tls_mac_data aad; const char *iv; char *buf; size_t hdr_len, mac_len, payload_len; int padding; hdr = src; hdr_len = tls_header_len(en); mac_len = tls_mac_len(en); ATF_REQUIRE(hdr->tls_vmajor == TLS_MAJOR_VER_ONE); ATF_REQUIRE(hdr->tls_vminor == en->tls_vminor); /* First, decrypt the outer payload into a temporary buffer. */ payload_len = len - hdr_len; buf = malloc(payload_len); if (en->tls_vminor == TLS_MINOR_VER_ZERO) iv = en->iv; else iv = (void *)(hdr + 1); ATF_REQUIRE(cbc_decrypt(tls_EVP_CIPHER(en), en->cipher_key, iv, (const u_char *)src + hdr_len, buf, payload_len)); /* * Copy the last encrypted block to use as the IV for the next * record for TLS 1.0. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) memcpy(__DECONST(uint8_t *, en->iv), (const u_char *)src + (len - AES_BLOCK_LEN), AES_BLOCK_LEN); /* * Verify trailing padding and strip. * * The kernel always generates the smallest amount of padding. */ padding = buf[payload_len - 1] + 1; ATF_REQUIRE(padding > 0 && padding <= AES_BLOCK_LEN); ATF_REQUIRE(payload_len >= mac_len + padding); payload_len -= padding; /* Verify HMAC. */ payload_len -= mac_len; tls_mte_aad(en, payload_len, hdr, seqno, &aad); ATF_REQUIRE(verify_hash(tls_EVP_MD(en), en->auth_key, en->auth_key_len, &aad, sizeof(aad), buf, payload_len, buf + payload_len)); ATF_REQUIRE(payload_len <= avail); memcpy(dst, buf, payload_len); *record_type = hdr->tls_type; return (payload_len); } static size_t decrypt_tls_12_aead(struct tls_enable *en, uint64_t seqno, const void *src, size_t len, void *dst, uint8_t *record_type) { const struct tls_record_layer *hdr; struct tls_aead_data aad; char nonce[12]; size_t hdr_len, mac_len, payload_len; hdr = src; hdr_len = tls_header_len(en); mac_len = tls_mac_len(en); payload_len = len - (hdr_len + mac_len); ATF_REQUIRE(hdr->tls_vmajor == TLS_MAJOR_VER_ONE); ATF_REQUIRE(hdr->tls_vminor == TLS_MINOR_VER_TWO); tls_12_aead_aad(en, payload_len, hdr, seqno, &aad); if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16) tls_12_gcm_nonce(en, hdr, nonce); else tls_13_nonce(en, seqno, nonce); ATF_REQUIRE(aead_decrypt(tls_EVP_CIPHER(en), en->cipher_key, nonce, &aad, sizeof(aad), (const char *)src + hdr_len, dst, payload_len, (const char *)src + hdr_len + payload_len, mac_len)); *record_type = hdr->tls_type; return (payload_len); } static size_t decrypt_tls_13_aead(struct tls_enable *en, uint64_t seqno, const void *src, size_t len, void *dst, uint8_t *record_type) { const struct tls_record_layer *hdr; struct tls_aead_data_13 aad; char nonce[12]; char *buf; size_t hdr_len, mac_len, payload_len; hdr = src; hdr_len = tls_header_len(en); mac_len = tls_mac_len(en); payload_len = len - (hdr_len + mac_len); ATF_REQUIRE(payload_len >= 1); ATF_REQUIRE(hdr->tls_type == TLS_RLTYPE_APP); ATF_REQUIRE(hdr->tls_vmajor == TLS_MAJOR_VER_ONE); ATF_REQUIRE(hdr->tls_vminor == TLS_MINOR_VER_TWO); tls_13_aad(en, hdr, seqno, &aad); tls_13_nonce(en, seqno, nonce); /* * Have to use a temporary buffer for the output due to the * record type as the last byte of the trailer. */ buf = malloc(payload_len); ATF_REQUIRE(aead_decrypt(tls_EVP_CIPHER(en), en->cipher_key, nonce, &aad, sizeof(aad), (const char *)src + hdr_len, buf, payload_len, (const char *)src + hdr_len + payload_len, mac_len)); /* Trim record type. */ *record_type = buf[payload_len - 1]; payload_len--; memcpy(dst, buf, payload_len); free(buf); return (payload_len); } static size_t decrypt_tls_aead(struct tls_enable *en, uint64_t seqno, const void *src, size_t len, void *dst, size_t avail, uint8_t *record_type) { const struct tls_record_layer *hdr; size_t payload_len; hdr = src; ATF_REQUIRE(ntohs(hdr->tls_length) + sizeof(*hdr) == len); payload_len = len - (tls_header_len(en) + tls_trailer_len(en)); ATF_REQUIRE(payload_len <= avail); if (en->tls_vminor == TLS_MINOR_VER_TWO) { ATF_REQUIRE(decrypt_tls_12_aead(en, seqno, src, len, dst, record_type) == payload_len); } else { ATF_REQUIRE(decrypt_tls_13_aead(en, seqno, src, len, dst, record_type) == payload_len); } return (payload_len); } static size_t decrypt_tls_record(struct tls_enable *en, uint64_t seqno, const void *src, size_t len, void *dst, size_t avail, uint8_t *record_type) { if (en->cipher_algorithm == CRYPTO_AES_CBC) return (decrypt_tls_aes_cbc_mte(en, seqno, src, len, dst, avail, record_type)); else return (decrypt_tls_aead(en, seqno, src, len, dst, avail, record_type)); } /* * Encrypt a TLS record of type 'record_type' with payload 'len' bytes * long at 'src' and store the result at 'dst'. If 'dst' doesn't have * sufficient room ('avail'), fail the test. */ static size_t encrypt_tls_12_aead(struct tls_enable *en, uint8_t record_type, uint64_t seqno, const void *src, size_t len, void *dst) { struct tls_record_layer *hdr; struct tls_aead_data aad; char nonce[12]; size_t hdr_len, mac_len, record_len; hdr = dst; hdr_len = tls_header_len(en); mac_len = tls_mac_len(en); record_len = hdr_len + len + mac_len; hdr->tls_type = record_type; hdr->tls_vmajor = TLS_MAJOR_VER_ONE; hdr->tls_vminor = TLS_MINOR_VER_TWO; hdr->tls_length = htons(record_len - sizeof(*hdr)); if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16) memcpy(hdr + 1, &seqno, sizeof(seqno)); tls_12_aead_aad(en, len, hdr, seqno, &aad); if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16) tls_12_gcm_nonce(en, hdr, nonce); else tls_13_nonce(en, seqno, nonce); ATF_REQUIRE(aead_encrypt(tls_EVP_CIPHER(en), en->cipher_key, nonce, &aad, sizeof(aad), src, (char *)dst + hdr_len, len, (char *)dst + hdr_len + len, mac_len)); return (record_len); } static size_t encrypt_tls_13_aead(struct tls_enable *en, uint8_t record_type, uint64_t seqno, const void *src, size_t len, void *dst, size_t padding) { struct tls_record_layer *hdr; struct tls_aead_data_13 aad; char nonce[12]; char *buf; size_t hdr_len, mac_len, record_len; hdr = dst; hdr_len = tls_header_len(en); mac_len = tls_mac_len(en); record_len = hdr_len + len + 1 + padding + mac_len; hdr->tls_type = TLS_RLTYPE_APP; hdr->tls_vmajor = TLS_MAJOR_VER_ONE; hdr->tls_vminor = TLS_MINOR_VER_TWO; hdr->tls_length = htons(record_len - sizeof(*hdr)); tls_13_aad(en, hdr, seqno, &aad); tls_13_nonce(en, seqno, nonce); /* * Have to use a temporary buffer for the input so that the record * type can be appended. */ buf = malloc(len + 1 + padding); memcpy(buf, src, len); buf[len] = record_type; memset(buf + len + 1, 0, padding); ATF_REQUIRE(aead_encrypt(tls_EVP_CIPHER(en), en->cipher_key, nonce, &aad, sizeof(aad), buf, (char *)dst + hdr_len, len + 1 + padding, (char *)dst + hdr_len + len + 1 + padding, mac_len)); free(buf); return (record_len); } static size_t encrypt_tls_aead(struct tls_enable *en, uint8_t record_type, uint64_t seqno, const void *src, size_t len, void *dst, size_t avail, size_t padding) { size_t record_len; record_len = tls_header_len(en) + len + padding + tls_trailer_len(en); ATF_REQUIRE(record_len <= avail); if (en->tls_vminor == TLS_MINOR_VER_TWO) { ATF_REQUIRE(padding == 0); ATF_REQUIRE(encrypt_tls_12_aead(en, record_type, seqno, src, len, dst) == record_len); } else ATF_REQUIRE(encrypt_tls_13_aead(en, record_type, seqno, src, len, dst, padding) == record_len); return (record_len); } static size_t encrypt_tls_record(struct tls_enable *en, uint8_t record_type, uint64_t seqno, const void *src, size_t len, void *dst, size_t avail, size_t padding) { return (encrypt_tls_aead(en, record_type, seqno, src, len, dst, avail, padding)); } static void test_ktls_transmit_app_data(struct tls_enable *en, uint64_t seqno, size_t len) { struct kevent ev; struct tls_record_layer *hdr; char *plaintext, *decrypted, *outbuf; size_t decrypted_len, outbuf_len, outbuf_cap, record_len, written; ssize_t rv; int kq, sockets[2]; uint8_t record_type; plaintext = alloc_buffer(len); decrypted = malloc(len); outbuf_cap = tls_header_len(en) + TLS_MAX_MSG_SIZE_V10_2 + tls_trailer_len(en); outbuf = malloc(outbuf_cap); hdr = (struct tls_record_layer *)outbuf; ATF_REQUIRE((kq = kqueue()) != -1); ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_TXTLS_ENABLE, en, sizeof(*en)) == 0); EV_SET(&ev, sockets[0], EVFILT_READ, EV_ADD, 0, 0, NULL); ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); EV_SET(&ev, sockets[1], EVFILT_WRITE, EV_ADD, 0, 0, NULL); ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); decrypted_len = 0; outbuf_len = 0; written = 0; while (decrypted_len != len) { ATF_REQUIRE(kevent(kq, NULL, 0, &ev, 1, NULL) == 1); switch (ev.filter) { case EVFILT_WRITE: /* Try to write any remaining data. */ rv = write(ev.ident, plaintext + written, len - written); ATF_REQUIRE_MSG(rv > 0, "failed to write to socket"); written += rv; if (written == len) { ev.flags = EV_DISABLE; ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); } break; case EVFILT_READ: ATF_REQUIRE((ev.flags & EV_EOF) == 0); /* * Try to read data for the next TLS record * into outbuf. Start by reading the header * to determine how much additional data to * read. */ if (outbuf_len < sizeof(struct tls_record_layer)) { rv = read(ev.ident, outbuf + outbuf_len, sizeof(struct tls_record_layer) - outbuf_len); ATF_REQUIRE_MSG(rv > 0, "failed to read from socket"); outbuf_len += rv; } if (outbuf_len < sizeof(struct tls_record_layer)) break; record_len = sizeof(struct tls_record_layer) + ntohs(hdr->tls_length); ATF_REQUIRE(record_len <= outbuf_cap); ATF_REQUIRE(record_len > outbuf_len); rv = read(ev.ident, outbuf + outbuf_len, record_len - outbuf_len); if (rv == -1 && errno == EAGAIN) break; ATF_REQUIRE_MSG(rv > 0, "failed to read from socket"); outbuf_len += rv; if (outbuf_len == record_len) { decrypted_len += decrypt_tls_record(en, seqno, outbuf, outbuf_len, decrypted + decrypted_len, len - decrypted_len, &record_type); ATF_REQUIRE(record_type == TLS_RLTYPE_APP); seqno++; outbuf_len = 0; } break; } } ATF_REQUIRE_MSG(written == decrypted_len, "read %zu decrypted bytes, but wrote %zu", decrypted_len, written); ATF_REQUIRE(memcmp(plaintext, decrypted, len) == 0); free(outbuf); free(decrypted); free(plaintext); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); ATF_REQUIRE(close(kq) == 0); } static void ktls_send_control_message(int fd, uint8_t type, void *data, size_t len) { struct msghdr msg; struct cmsghdr *cmsg; char cbuf[CMSG_SPACE(sizeof(type))]; struct iovec iov; memset(&msg, 0, sizeof(msg)); msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = IPPROTO_TCP; cmsg->cmsg_type = TLS_SET_RECORD_TYPE; cmsg->cmsg_len = CMSG_LEN(sizeof(type)); *(uint8_t *)CMSG_DATA(cmsg) = type; iov.iov_base = data; iov.iov_len = len; msg.msg_iov = &iov; msg.msg_iovlen = 1; ATF_REQUIRE(sendmsg(fd, &msg, 0) == (ssize_t)len); } static void test_ktls_transmit_control(struct tls_enable *en, uint64_t seqno, uint8_t type, size_t len) { struct tls_record_layer *hdr; char *plaintext, *decrypted, *outbuf; size_t outbuf_cap, payload_len, record_len; ssize_t rv; int sockets[2]; uint8_t record_type; ATF_REQUIRE(len <= TLS_MAX_MSG_SIZE_V10_2); plaintext = alloc_buffer(len); decrypted = malloc(len); outbuf_cap = tls_header_len(en) + len + tls_trailer_len(en); outbuf = malloc(outbuf_cap); hdr = (struct tls_record_layer *)outbuf; ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_TXTLS_ENABLE, en, sizeof(*en)) == 0); fd_set_blocking(sockets[0]); fd_set_blocking(sockets[1]); ktls_send_control_message(sockets[1], type, plaintext, len); /* * First read the header to determine how much additional data * to read. */ rv = read(sockets[0], outbuf, sizeof(struct tls_record_layer)); ATF_REQUIRE(rv == sizeof(struct tls_record_layer)); payload_len = ntohs(hdr->tls_length); record_len = payload_len + sizeof(struct tls_record_layer); ATF_REQUIRE(record_len <= outbuf_cap); rv = read(sockets[0], outbuf + sizeof(struct tls_record_layer), payload_len); ATF_REQUIRE(rv == (ssize_t)payload_len); rv = decrypt_tls_record(en, seqno, outbuf, record_len, decrypted, len, &record_type); ATF_REQUIRE_MSG((ssize_t)len == rv, "read %zd decrypted bytes, but wrote %zu", rv, len); ATF_REQUIRE(record_type == type); ATF_REQUIRE(memcmp(plaintext, decrypted, len) == 0); free(outbuf); free(decrypted); free(plaintext); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); } static void test_ktls_transmit_empty_fragment(struct tls_enable *en, uint64_t seqno) { struct tls_record_layer *hdr; char *outbuf; size_t outbuf_cap, payload_len, record_len; ssize_t rv; int sockets[2]; uint8_t record_type; outbuf_cap = tls_header_len(en) + tls_trailer_len(en); outbuf = malloc(outbuf_cap); hdr = (struct tls_record_layer *)outbuf; ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_TXTLS_ENABLE, en, sizeof(*en)) == 0); fd_set_blocking(sockets[0]); fd_set_blocking(sockets[1]); - /* A write of zero bytes should send an empty fragment. */ + /* + * A write of zero bytes should send an empty fragment only for + * TLS 1.0, otherwise an error should be raised. + */ rv = write(sockets[1], NULL, 0); - ATF_REQUIRE(rv == 0); + if (rv == 0) { + ATF_REQUIRE(en->cipher_algorithm == CRYPTO_AES_CBC); + ATF_REQUIRE(en->tls_vminor == TLS_MINOR_VER_ZERO); + } else { + ATF_REQUIRE(rv == -1); + ATF_REQUIRE(errno == EINVAL); + goto out; + } /* * First read the header to determine how much additional data * to read. */ rv = read(sockets[0], outbuf, sizeof(struct tls_record_layer)); ATF_REQUIRE(rv == sizeof(struct tls_record_layer)); payload_len = ntohs(hdr->tls_length); record_len = payload_len + sizeof(struct tls_record_layer); ATF_REQUIRE(record_len <= outbuf_cap); rv = read(sockets[0], outbuf + sizeof(struct tls_record_layer), payload_len); ATF_REQUIRE(rv == (ssize_t)payload_len); rv = decrypt_tls_record(en, seqno, outbuf, record_len, NULL, 0, &record_type); ATF_REQUIRE_MSG(rv == 0, "read %zd decrypted bytes for an empty fragment", rv); ATF_REQUIRE(record_type == TLS_RLTYPE_APP); +out: free(outbuf); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); } static size_t ktls_receive_tls_record(struct tls_enable *en, int fd, uint8_t record_type, void *data, size_t len) { struct msghdr msg; struct cmsghdr *cmsg; struct tls_get_record *tgr; char cbuf[CMSG_SPACE(sizeof(*tgr))]; struct iovec iov; ssize_t rv; memset(&msg, 0, sizeof(msg)); msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); iov.iov_base = data; iov.iov_len = len; msg.msg_iov = &iov; msg.msg_iovlen = 1; ATF_REQUIRE((rv = recvmsg(fd, &msg, 0)) > 0); ATF_REQUIRE((msg.msg_flags & (MSG_EOR | MSG_CTRUNC)) == MSG_EOR); cmsg = CMSG_FIRSTHDR(&msg); ATF_REQUIRE(cmsg != NULL); ATF_REQUIRE(cmsg->cmsg_level == IPPROTO_TCP); ATF_REQUIRE(cmsg->cmsg_type == TLS_GET_RECORD); ATF_REQUIRE(cmsg->cmsg_len == CMSG_LEN(sizeof(*tgr))); tgr = (struct tls_get_record *)CMSG_DATA(cmsg); ATF_REQUIRE(tgr->tls_type == record_type); ATF_REQUIRE(tgr->tls_vmajor == en->tls_vmajor); /* XXX: Not sure if this is what OpenSSL expects? */ if (en->tls_vminor == TLS_MINOR_VER_THREE) ATF_REQUIRE(tgr->tls_vminor == TLS_MINOR_VER_TWO); else ATF_REQUIRE(tgr->tls_vminor == en->tls_vminor); ATF_REQUIRE(tgr->tls_length == htons(rv)); return (rv); } static void test_ktls_receive_app_data(struct tls_enable *en, uint64_t seqno, size_t len, size_t padding) { struct kevent ev; char *plaintext, *received, *outbuf; size_t outbuf_cap, outbuf_len, outbuf_sent, received_len, todo, written; ssize_t rv; int kq, sockets[2]; plaintext = alloc_buffer(len); received = malloc(len); outbuf_cap = tls_header_len(en) + TLS_MAX_MSG_SIZE_V10_2 + tls_trailer_len(en); outbuf = malloc(outbuf_cap); ATF_REQUIRE((kq = kqueue()) != -1); ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[0], IPPROTO_TCP, TCP_RXTLS_ENABLE, en, sizeof(*en)) == 0); EV_SET(&ev, sockets[0], EVFILT_READ, EV_ADD, 0, 0, NULL); ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); EV_SET(&ev, sockets[1], EVFILT_WRITE, EV_ADD, 0, 0, NULL); ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); received_len = 0; outbuf_len = 0; written = 0; while (received_len != len) { ATF_REQUIRE(kevent(kq, NULL, 0, &ev, 1, NULL) == 1); switch (ev.filter) { case EVFILT_WRITE: /* * Compose the next TLS record to send. */ if (outbuf_len == 0) { ATF_REQUIRE(written < len); todo = len - written; if (todo > TLS_MAX_MSG_SIZE_V10_2 - padding) todo = TLS_MAX_MSG_SIZE_V10_2 - padding; outbuf_len = encrypt_tls_record(en, TLS_RLTYPE_APP, seqno, plaintext + written, todo, outbuf, outbuf_cap, padding); outbuf_sent = 0; written += todo; seqno++; } /* * Try to write the remainder of the current * TLS record. */ rv = write(ev.ident, outbuf + outbuf_sent, outbuf_len - outbuf_sent); ATF_REQUIRE_MSG(rv > 0, "failed to write to socket"); outbuf_sent += rv; if (outbuf_sent == outbuf_len) { outbuf_len = 0; if (written == len) { ev.flags = EV_DISABLE; ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); } } break; case EVFILT_READ: ATF_REQUIRE((ev.flags & EV_EOF) == 0); rv = ktls_receive_tls_record(en, ev.ident, TLS_RLTYPE_APP, received + received_len, len - received_len); received_len += rv; break; } } ATF_REQUIRE_MSG(written == received_len, "read %zu decrypted bytes, but wrote %zu", received_len, written); ATF_REQUIRE(memcmp(plaintext, received, len) == 0); free(outbuf); free(received); free(plaintext); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); ATF_REQUIRE(close(kq) == 0); } #define TLS_10_TESTS(M) \ M(aes128_cbc_1_0_sha1, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA1_HMAC) \ M(aes256_cbc_1_0_sha1, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA1_HMAC) #define TLS_13_TESTS(M) \ M(aes128_gcm_1_3, CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, \ TLS_MINOR_VER_THREE) \ M(aes256_gcm_1_3, CRYPTO_AES_NIST_GCM_16, 256 / 8, 0, \ TLS_MINOR_VER_THREE) \ M(chacha20_poly1305_1_3, CRYPTO_CHACHA20_POLY1305, 256 / 8, 0, \ TLS_MINOR_VER_THREE) #define AES_CBC_TESTS(M) \ M(aes128_cbc_1_0_sha1, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_ZERO) \ M(aes256_cbc_1_0_sha1, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_ZERO) \ M(aes128_cbc_1_1_sha1, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_ONE) \ M(aes256_cbc_1_1_sha1, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_ONE) \ M(aes128_cbc_1_2_sha1, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_TWO) \ M(aes256_cbc_1_2_sha1, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_TWO) \ M(aes128_cbc_1_2_sha256, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_256_HMAC, TLS_MINOR_VER_TWO) \ M(aes256_cbc_1_2_sha256, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA2_256_HMAC, TLS_MINOR_VER_TWO) \ M(aes128_cbc_1_2_sha384, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_384_HMAC, TLS_MINOR_VER_TWO) \ M(aes256_cbc_1_2_sha384, CRYPTO_AES_CBC, 256 / 8, \ CRYPTO_SHA2_384_HMAC, TLS_MINOR_VER_TWO) \ #define AES_GCM_TESTS(M) \ M(aes128_gcm_1_2, CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, \ TLS_MINOR_VER_TWO) \ M(aes256_gcm_1_2, CRYPTO_AES_NIST_GCM_16, 256 / 8, 0, \ TLS_MINOR_VER_TWO) \ M(aes128_gcm_1_3, CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, \ TLS_MINOR_VER_THREE) \ M(aes256_gcm_1_3, CRYPTO_AES_NIST_GCM_16, 256 / 8, 0, \ TLS_MINOR_VER_THREE) #define CHACHA20_TESTS(M) \ M(chacha20_poly1305_1_2, CRYPTO_CHACHA20_POLY1305, 256 / 8, 0, \ TLS_MINOR_VER_TWO) \ M(chacha20_poly1305_1_3, CRYPTO_CHACHA20_POLY1305, 256 / 8, 0, \ TLS_MINOR_VER_THREE) #define GEN_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name, len) \ ATF_TC_WITHOUT_HEAD(ktls_transmit_##cipher_name##_##name); \ ATF_TC_BODY(ktls_transmit_##cipher_name##_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_transmit_app_data(&en, seqno, len); \ free_tls_enable(&en); \ } #define ADD_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name) \ ATF_TP_ADD_TC(tp, ktls_transmit_##cipher_name##_##name); #define GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name, type, len) \ ATF_TC_WITHOUT_HEAD(ktls_transmit_##cipher_name##_##name); \ ATF_TC_BODY(ktls_transmit_##cipher_name##_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_transmit_control(&en, seqno, type, len); \ free_tls_enable(&en); \ } #define ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name) \ ATF_TP_ADD_TC(tp, ktls_transmit_##cipher_name##_##name); #define GEN_TRANSMIT_EMPTY_FRAGMENT_TEST(cipher_name, cipher_alg, \ - key_size, auth_alg) \ + key_size, auth_alg, minor) \ ATF_TC_WITHOUT_HEAD(ktls_transmit_##cipher_name##_empty_fragment); \ ATF_TC_BODY(ktls_transmit_##cipher_name##_empty_fragment, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ - build_tls_enable(cipher_alg, key_size, auth_alg, \ - TLS_MINOR_VER_ZERO, seqno, &en); \ + build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ + &en); \ test_ktls_transmit_empty_fragment(&en, seqno); \ free_tls_enable(&en); \ } #define ADD_TRANSMIT_EMPTY_FRAGMENT_TEST(cipher_name, cipher_alg, \ - key_size, auth_alg) \ + key_size, auth_alg, minor) \ ATF_TP_ADD_TC(tp, ktls_transmit_##cipher_name##_empty_fragment); #define GEN_TRANSMIT_TESTS(cipher_name, cipher_alg, key_size, auth_alg, \ minor) \ GEN_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short, 64) \ GEN_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long, 64 * 1024) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, control, 0x21 /* Alert */, 32) #define ADD_TRANSMIT_TESTS(cipher_name, cipher_alg, key_size, auth_alg, \ minor) \ ADD_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short) \ ADD_TRANSMIT_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, control) /* * For each supported cipher suite, run three transmit tests: * * - a short test which sends 64 bytes of application data (likely as * a single TLS record) * * - a long test which sends 64KB of application data (split across * multiple TLS records) * * - a control test which sends a single record with a specific * content type via sendmsg() */ AES_CBC_TESTS(GEN_TRANSMIT_TESTS); AES_GCM_TESTS(GEN_TRANSMIT_TESTS); CHACHA20_TESTS(GEN_TRANSMIT_TESTS); #define GEN_TRANSMIT_PADDING_TESTS(cipher_name, cipher_alg, key_size, \ auth_alg, minor) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_1, 0x21 /* Alert */, 1) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_2, 0x21 /* Alert */, 2) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_3, 0x21 /* Alert */, 3) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_4, 0x21 /* Alert */, 4) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_5, 0x21 /* Alert */, 5) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_6, 0x21 /* Alert */, 6) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_7, 0x21 /* Alert */, 7) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_8, 0x21 /* Alert */, 8) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_9, 0x21 /* Alert */, 9) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_10, 0x21 /* Alert */, 10) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_11, 0x21 /* Alert */, 11) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_12, 0x21 /* Alert */, 12) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_13, 0x21 /* Alert */, 13) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_14, 0x21 /* Alert */, 14) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_15, 0x21 /* Alert */, 15) \ GEN_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_16, 0x21 /* Alert */, 16) #define ADD_TRANSMIT_PADDING_TESTS(cipher_name, cipher_alg, key_size, \ auth_alg, minor) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_1) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_2) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_3) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_4) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_5) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_6) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_7) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_8) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_9) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_10) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_11) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_12) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_13) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_14) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_15) \ ADD_TRANSMIT_CONTROL_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, padding_16) /* * For AES-CBC MTE cipher suites using padding, add tests of messages * with each possible padding size. Note that the padding_ tests * do not necessarily test bytes of padding as the padding is a * function of the cipher suite's MAC length. However, cycling * through all of the payload sizes from 1 to 16 should exercise all * of the possible padding lengths for each suite. */ AES_CBC_TESTS(GEN_TRANSMIT_PADDING_TESTS); /* * Test "empty fragments" which are TLS records with no payload that * OpenSSL can send for TLS 1.0 connections. */ -TLS_10_TESTS(GEN_TRANSMIT_EMPTY_FRAGMENT_TEST); +AES_CBC_TESTS(GEN_TRANSMIT_EMPTY_FRAGMENT_TEST); +AES_GCM_TESTS(GEN_TRANSMIT_EMPTY_FRAGMENT_TEST); +CHACHA20_TESTS(GEN_TRANSMIT_EMPTY_FRAGMENT_TEST); static void test_ktls_invalid_transmit_cipher_suite(struct tls_enable *en) { int sockets[2]; ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_TXTLS_ENABLE, en, sizeof(*en)) == -1); ATF_REQUIRE(errno == EINVAL); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); } #define GEN_INVALID_TRANSMIT_TEST(name, cipher_alg, key_size, auth_alg, \ minor) \ ATF_TC_WITHOUT_HEAD(ktls_transmit_invalid_##name); \ ATF_TC_BODY(ktls_transmit_invalid_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_invalid_transmit_cipher_suite(&en); \ free_tls_enable(&en); \ } #define ADD_INVALID_TRANSMIT_TEST(name, cipher_alg, key_size, auth_alg, \ minor) \ ATF_TP_ADD_TC(tp, ktls_transmit_invalid_##name); #define INVALID_CIPHER_SUITES(M) \ M(aes128_cbc_1_0_sha256, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_256_HMAC, TLS_MINOR_VER_ZERO) \ M(aes128_cbc_1_0_sha384, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_384_HMAC, TLS_MINOR_VER_ZERO) \ M(aes128_gcm_1_0, CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, \ TLS_MINOR_VER_ZERO) \ M(chacha20_poly1305_1_0, CRYPTO_CHACHA20_POLY1305, 256 / 8, 0, \ TLS_MINOR_VER_ZERO) \ M(aes128_cbc_1_1_sha256, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_256_HMAC, TLS_MINOR_VER_ONE) \ M(aes128_cbc_1_1_sha384, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_384_HMAC, TLS_MINOR_VER_ONE) \ M(aes128_gcm_1_1, CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, \ TLS_MINOR_VER_ONE) \ M(chacha20_poly1305_1_1, CRYPTO_CHACHA20_POLY1305, 256 / 8, 0, \ TLS_MINOR_VER_ONE) \ M(aes128_cbc_1_3_sha1, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA1_HMAC, TLS_MINOR_VER_THREE) \ M(aes128_cbc_1_3_sha256, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_256_HMAC, TLS_MINOR_VER_THREE) \ M(aes128_cbc_1_3_sha384, CRYPTO_AES_CBC, 128 / 8, \ CRYPTO_SHA2_384_HMAC, TLS_MINOR_VER_THREE) /* * Ensure that invalid cipher suites are rejected for transmit. */ INVALID_CIPHER_SUITES(GEN_INVALID_TRANSMIT_TEST); #define GEN_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name, len, padding) \ ATF_TC_WITHOUT_HEAD(ktls_receive_##cipher_name##_##name); \ ATF_TC_BODY(ktls_receive_##cipher_name##_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_receive_app_data(&en, seqno, len, padding); \ free_tls_enable(&en); \ } #define ADD_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, name) \ ATF_TP_ADD_TC(tp, ktls_receive_##cipher_name##_##name); #define GEN_RECEIVE_TESTS(cipher_name, cipher_alg, key_size, auth_alg, \ minor) \ GEN_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short, 64, 0) \ GEN_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long, 64 * 1024, 0) #define ADD_RECEIVE_TESTS(cipher_name, cipher_alg, key_size, auth_alg, \ minor) \ ADD_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short) \ ADD_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long) /* * For each supported cipher suite, run two receive tests: * * - a short test which sends 64 bytes of application data (likely as * a single TLS record) * * - a long test which sends 64KB of application data (split across * multiple TLS records) * * Note that receive is currently only supported for TLS 1.2 AEAD * cipher suites. */ AES_GCM_TESTS(GEN_RECEIVE_TESTS); CHACHA20_TESTS(GEN_RECEIVE_TESTS); #define GEN_PADDING_RECEIVE_TESTS(cipher_name, cipher_alg, key_size, \ auth_alg, minor) \ GEN_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short_padded, 64, 16) \ GEN_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long_padded, 64 * 1024, 15) #define ADD_PADDING_RECEIVE_TESTS(cipher_name, cipher_alg, key_size, \ auth_alg, minor) \ ADD_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, short_padded) \ ADD_RECEIVE_APP_DATA_TEST(cipher_name, cipher_alg, key_size, \ auth_alg, minor, long_padded) /* * For TLS 1.3 cipher suites, run two additional receive tests which * use add padding to each record. */ TLS_13_TESTS(GEN_PADDING_RECEIVE_TESTS); static void test_ktls_invalid_receive_cipher_suite(struct tls_enable *en) { int sockets[2]; ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_RXTLS_ENABLE, en, sizeof(*en)) == -1); ATF_REQUIRE(errno == EINVAL); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); } #define GEN_INVALID_RECEIVE_TEST(name, cipher_alg, key_size, auth_alg, \ minor) \ ATF_TC_WITHOUT_HEAD(ktls_receive_invalid_##name); \ ATF_TC_BODY(ktls_receive_invalid_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_invalid_receive_cipher_suite(&en); \ free_tls_enable(&en); \ } #define ADD_INVALID_RECEIVE_TEST(name, cipher_alg, key_size, auth_alg, \ minor) \ ATF_TP_ADD_TC(tp, ktls_receive_invalid_##name); /* * Ensure that invalid cipher suites are rejected for receive. */ INVALID_CIPHER_SUITES(GEN_INVALID_RECEIVE_TEST); static void test_ktls_unsupported_receive_cipher_suite(struct tls_enable *en) { int sockets[2]; ATF_REQUIRE_MSG(socketpair_tcp(sockets), "failed to create sockets"); ATF_REQUIRE(setsockopt(sockets[1], IPPROTO_TCP, TCP_RXTLS_ENABLE, en, sizeof(*en)) == -1); ATF_REQUIRE(errno == EPROTONOSUPPORT); ATF_REQUIRE(close(sockets[1]) == 0); ATF_REQUIRE(close(sockets[0]) == 0); } #define GEN_UNSUPPORTED_RECEIVE_TEST(name, cipher_alg, key_size, \ auth_alg, minor) \ ATF_TC_WITHOUT_HEAD(ktls_receive_unsupported_##name); \ ATF_TC_BODY(ktls_receive_unsupported_##name, tc) \ { \ struct tls_enable en; \ uint64_t seqno; \ \ ATF_REQUIRE_KTLS(); \ seqno = random(); \ build_tls_enable(cipher_alg, key_size, auth_alg, minor, seqno, \ &en); \ test_ktls_unsupported_receive_cipher_suite(&en); \ free_tls_enable(&en); \ } #define ADD_UNSUPPORTED_RECEIVE_TEST(name, cipher_alg, key_size, \ auth_alg, minor) \ ATF_TP_ADD_TC(tp, ktls_receive_unsupported_##name); /* * Ensure that valid cipher suites not supported for receive are * rejected. */ AES_CBC_TESTS(GEN_UNSUPPORTED_RECEIVE_TEST); /* * Try to perform an invalid sendto(2) on a TXTLS-enabled socket, to exercise * KTLS error handling in the socket layer. */ ATF_TC_WITHOUT_HEAD(ktls_sendto_baddst); ATF_TC_BODY(ktls_sendto_baddst, tc) { char buf[32]; struct sockaddr_in dst; struct tls_enable en; ssize_t n; int s; ATF_REQUIRE_KTLS(); s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); ATF_REQUIRE(s >= 0); build_tls_enable(CRYPTO_AES_NIST_GCM_16, 128 / 8, 0, TLS_MINOR_VER_THREE, (uint64_t)random(), &en); ATF_REQUIRE(setsockopt(s, IPPROTO_TCP, TCP_TXTLS_ENABLE, &en, sizeof(en)) == 0); memset(&dst, 0, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); dst.sin_addr.s_addr = htonl(INADDR_BROADCAST); dst.sin_port = htons(12345); memset(buf, 0, sizeof(buf)); n = sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&dst, sizeof(dst)); /* Can't transmit to the broadcast address over TCP. */ ATF_REQUIRE_ERRNO(EACCES, n == -1); ATF_REQUIRE(close(s) == 0); } ATF_TP_ADD_TCS(tp) { /* Transmit tests */ AES_CBC_TESTS(ADD_TRANSMIT_TESTS); AES_GCM_TESTS(ADD_TRANSMIT_TESTS); CHACHA20_TESTS(ADD_TRANSMIT_TESTS); AES_CBC_TESTS(ADD_TRANSMIT_PADDING_TESTS); - TLS_10_TESTS(ADD_TRANSMIT_EMPTY_FRAGMENT_TEST); + AES_CBC_TESTS(ADD_TRANSMIT_EMPTY_FRAGMENT_TEST); + AES_GCM_TESTS(ADD_TRANSMIT_EMPTY_FRAGMENT_TEST); + CHACHA20_TESTS(ADD_TRANSMIT_EMPTY_FRAGMENT_TEST); INVALID_CIPHER_SUITES(ADD_INVALID_TRANSMIT_TEST); /* Receive tests */ AES_CBC_TESTS(ADD_UNSUPPORTED_RECEIVE_TEST); AES_GCM_TESTS(ADD_RECEIVE_TESTS); CHACHA20_TESTS(ADD_RECEIVE_TESTS); TLS_13_TESTS(ADD_PADDING_RECEIVE_TESTS); INVALID_CIPHER_SUITES(ADD_INVALID_RECEIVE_TEST); /* Miscellaneous */ ATF_TP_ADD_TC(tp, ktls_sendto_baddst); return (atf_no_error()); }