Index: share/man/man4/tcp.4
===================================================================
--- share/man/man4/tcp.4
+++ share/man/man4/tcp.4
@@ -319,14 +319,11 @@
 The payload of this control message is a single byte holding the desired
 TLS record type.
 .Pp
-Data read from this socket will still be encrypted and must be parsed by
-a TLS-aware consumer.
-.Pp
-At present, only a single key may be set on a socket.
+At present, only a single transmit key may be set on a socket.
 As such, users of this option must disable rekeying.
 .It Dv TCP_TXTLS_MODE
-The integer argument can be used to get or set the current TLS mode of a
-socket.
+The integer argument can be used to get or set the current TLS transmit mode
+of a socket.
 Setting the mode can only be used to toggle between software and NIC TLS
 after TLS has been initially enabled via the
 .Dv TCP_TXTLS_ENABLE
@@ -342,6 +339,45 @@
 .It Dv TCP_TLS_MODE_IFNET
 TLS records are encrypted by the network interface card (NIC).
 .El
+.It Dv TCP_RXTLS_ENABLE
+Enable in-kernel TLS for data read from this socket.
+The
+.Vt struct tls_so_enable
+argument defines the encryption and authentication algorithms and keys
+used to decrypt the socket data.
+.Pp
+Each received TLS record must be read from the socket using
+.Xr recvmsg 2 .
+Each received TLS record will contain a
+.Dv TLS_GET_RECORD
+control message along with the decrypted payload.
+The control message contains a
+.Vt struct tls_get_record
+which includes fields from the TLS record header.
+If an invalid or corrupted TLS record is received,
+.Xr recvmsg 2
+will fail with one of the following errors:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The version fields in a TLS record's header did not match the version required
+by the
+.Vt struct tls_so_enable
+structure used to enable in-kernel TLS.
+.It Bq Er EMSGSIZE
+A TLS record's length was either too small or too large.
+.It Bq Er EMSGSIZE
+The connection was closed after sending a truncated TLS record.
+.It Bq Er EBADMSG
+The TLS record failed to match the included authentication tag.
+.El
+.Pp
+At present, only a single receive key may be set on a socket.
+As such, users of this option must disable rekeying.
+.It Dv TCP_RXTLS_MODE
+The integer argument can be used to get the current TLS receive mode
+of a socket.
+The available modes are the same as for
+.Dv TCP_TXTLS_MODE .
 .El
 .Pp
 The option level for the
Index: sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- sys/dev/cxgbe/tom/t4_tls.c
+++ sys/dev/cxgbe/tom/t4_tls.c
@@ -379,7 +379,7 @@
	int proto_ver = kctx->proto_ver;

	kwr->u.rxhdr.flitcnt_hmacctrl =
-	    ((kctx->tx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;
+	    ((kctx->rx_key_info_size >> 4) << 3) | kctx->hmac_ctrl;

	kwr->u.rxhdr.protover_ciphmode =
	    V_TLS_KEYCTX_TX_WR_PROTOVER(get_proto_ver(proto_ver)) |
@@ -408,7 +408,7 @@
		    (IPAD_SIZE + OPAD_SIZE));
	} else {
		memcpy(kwr->keys.edkey, kctx->rx.key,
-		    (kctx->tx_key_info_size - SALT_SIZE));
+		    (kctx->rx_key_info_size - SALT_SIZE));
		memcpy(kwr->u.rxhdr.rxsalt, kctx->rx.salt, SALT_SIZE);
	}
}
@@ -674,6 +674,13 @@

	if ((G_KEY_GET_LOC(k_ctx->l_p_key) == KEY_WRITE_RX) ||
	    (tls_ofld->key_location == TLS_SFO_WR_CONTEXTLOC_DDR)) {
+
+		/*
+		 * XXX: The userland library sets tx_key_info_size, not
+		 * rx_key_info_size.
+		 */
+		k_ctx->rx_key_info_size = k_ctx->tx_key_info_size;
+
		error = tls_program_key_id(toep, k_ctx);
		if (error) {
			/* XXX: Only clear quiesce for KEY_WRITE_RX?
 */
@@ -866,31 +873,36 @@

 #ifdef KERN_TLS
 static void
-init_ktls_key_context(struct ktls_session *tls, struct tls_key_context *k_ctx)
+init_ktls_key_context(struct ktls_session *tls, struct tls_key_context *k_ctx,
+    bool transmit)
 {
	struct auth_hash *axf;
-	u_int mac_key_size;
-	char *hash;
+	u_int key_info_size, mac_key_size;
+	char *hash, *key;

-	k_ctx->l_p_key = V_KEY_GET_LOC(KEY_WRITE_TX);
-	if (tls->params.tls_vminor == TLS_MINOR_VER_ONE)
-		k_ctx->proto_ver = SCMD_PROTO_VERSION_TLS_1_1;
-	else
-		k_ctx->proto_ver = SCMD_PROTO_VERSION_TLS_1_2;
+	k_ctx->l_p_key = V_KEY_GET_LOC(transmit ? KEY_WRITE_TX : KEY_WRITE_RX);
+	k_ctx->proto_ver = tls->params.tls_vmajor << 8 | tls->params.tls_vminor;
	k_ctx->cipher_secret_size = tls->params.cipher_key_len;
-	k_ctx->tx_key_info_size = sizeof(struct tx_keyctx_hdr) +
+	key_info_size = sizeof(struct tx_keyctx_hdr) +
	    k_ctx->cipher_secret_size;
-	memcpy(k_ctx->tx.key, tls->params.cipher_key,
-	    tls->params.cipher_key_len);
-	hash = k_ctx->tx.key + tls->params.cipher_key_len;
+	if (transmit)
+		key = k_ctx->tx.key;
+	else
+		key = k_ctx->rx.key;
+	memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len);
+	hash = key + tls->params.cipher_key_len;
	if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) {
		k_ctx->state.auth_mode = SCMD_AUTH_MODE_GHASH;
		k_ctx->state.enc_mode = SCMD_CIPH_MODE_AES_GCM;
		k_ctx->iv_size = 4;
		k_ctx->mac_first = 0;
		k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NOP;
-		k_ctx->tx_key_info_size += GMAC_BLOCK_LEN;
-		memcpy(k_ctx->tx.salt, tls->params.iv, SALT_SIZE);
+		key_info_size += GMAC_BLOCK_LEN;
+		k_ctx->mac_secret_size = 0;
+		if (transmit)
+			memcpy(k_ctx->tx.salt, tls->params.iv, SALT_SIZE);
+		else
+			memcpy(k_ctx->rx.salt, tls->params.iv, SALT_SIZE);
		t4_init_gmac_hash(tls->params.cipher_key,
		    tls->params.cipher_key_len * 8, hash);
	} else {
@@ -917,29 +929,38 @@
		k_ctx->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */
		k_ctx->mac_first = 1;
		k_ctx->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC;
-		k_ctx->tx_key_info_size += roundup2(mac_key_size, 16) * 2;
+		key_info_size += roundup2(mac_key_size, 16) * 2;
		k_ctx->mac_secret_size = mac_key_size;
		t4_init_hmac_digest(axf, mac_key_size, tls->params.auth_key,
		    tls->params.auth_key_len * 8, hash);
	}

+	if (transmit)
+		k_ctx->tx_key_info_size = key_info_size;
+	else
+		k_ctx->rx_key_info_size = key_info_size;
	k_ctx->frag_size = tls->params.max_frame_len;
	k_ctx->iv_ctrl = 1;
 }

 int
-tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls)
+tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, bool transmit)
 {
+	struct adapter *sc = td_adapter(toep->td);
	struct tls_key_context *k_ctx;
-	int error;
+	int error, key_offset;

	if (toep->tls.mode == TLS_MODE_TLSOM)
		return (EINVAL);
	if (!can_tls_offload(td_adapter(toep->td)))
		return (EINVAL);
	switch (ulp_mode(toep)) {
+	case ULP_MODE_TLS:
+		break;
	case ULP_MODE_NONE:
	case ULP_MODE_TCPDDP:
+		if (!transmit)
+			return (EINVAL);
		break;
	default:
		return (EINVAL);
	}
@@ -987,47 +1008,80 @@
	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
		return (EPROTONOSUPPORT);

+	/* Bail if we already have a key. */
+	if (transmit) {
+		if (toep->tls.tx_key_addr != -1)
+			return (EOPNOTSUPP);
+	} else {
+		if (toep->tls.rx_key_addr != -1)
+			return (EOPNOTSUPP);
+	}
+
	/*
	 * XXX: This assumes no key renegotiation.  If KTLS ever supports
	 * that we will want to allocate TLS sessions dynamically rather
	 * than as a static member of toep.
*/ k_ctx = &toep->tls.k_ctx; - init_ktls_key_context(tls, k_ctx); - - toep->tls.scmd0.seqno_numivs = - (V_SCMD_SEQ_NO_CTRL(3) | - V_SCMD_PROTO_VERSION(k_ctx->proto_ver) | - V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | - V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) | - V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) | - V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) | - V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) | - V_SCMD_IV_SIZE(k_ctx->iv_size)); - - toep->tls.scmd0.ivgen_hdrlen = - (V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) | - V_SCMD_KEY_CTX_INLINE(0) | - V_SCMD_TLS_FRAG_ENABLE(1)); - - if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) - toep->tls.iv_len = 8; - else - toep->tls.iv_len = AES_BLOCK_LEN; - - toep->tls.mac_length = k_ctx->mac_secret_size; - - toep->tls.tx_key_addr = -1; + init_ktls_key_context(tls, k_ctx, transmit); error = tls_program_key_id(toep, k_ctx); if (error) return (error); - toep->tls.fcplenmax = get_tp_plen_max(&toep->tls); - toep->tls.expn_per_ulp = tls->params.tls_hlen + tls->params.tls_tlen; - toep->tls.pdus_per_ulp = 1; - toep->tls.adjusted_plen = toep->tls.expn_per_ulp + - toep->tls.k_ctx.frag_size; + if (transmit) { + toep->tls.scmd0.seqno_numivs = + (V_SCMD_SEQ_NO_CTRL(3) | + V_SCMD_PROTO_VERSION(get_proto_ver(k_ctx->proto_ver)) | + V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | + V_SCMD_CIPH_AUTH_SEQ_CTRL((k_ctx->mac_first == 0)) | + V_SCMD_CIPH_MODE(k_ctx->state.enc_mode) | + V_SCMD_AUTH_MODE(k_ctx->state.auth_mode) | + V_SCMD_HMAC_CTRL(k_ctx->hmac_ctrl) | + V_SCMD_IV_SIZE(k_ctx->iv_size)); + + toep->tls.scmd0.ivgen_hdrlen = + (V_SCMD_IV_GEN_CTRL(k_ctx->iv_ctrl) | + V_SCMD_KEY_CTX_INLINE(0) | + V_SCMD_TLS_FRAG_ENABLE(1)); + + if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) + toep->tls.iv_len = 8; + else + toep->tls.iv_len = AES_BLOCK_LEN; + + toep->tls.mac_length = k_ctx->mac_secret_size; + + toep->tls.fcplenmax = get_tp_plen_max(&toep->tls); + toep->tls.expn_per_ulp = tls->params.tls_hlen + + tls->params.tls_tlen; + toep->tls.pdus_per_ulp = 1; + toep->tls.adjusted_plen = toep->tls.expn_per_ulp + + toep->tls.k_ctx.frag_size; + } else { + /* Stop timer on handshake completion */ + tls_stop_handshake_timer(toep); + + toep->flags &= ~TPF_FORCE_CREDITS; + + /* + * RX key tags are an index into the key portion of MA + * memory stored as an offset from the base address in + * units of 64 bytes. 
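+		 *
+		 * For example (addresses hypothetical): a receive key
+		 * written at sc->vres.key.start + 256 is programmed
+		 * below as key id 4 (256 / 64).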
+ */ + key_offset = toep->tls.rx_key_addr - sc->vres.key.start; + t4_set_tls_keyid(toep, key_offset / 64); + t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, + V_TCB_ULP_RAW(M_TCB_ULP_RAW), + V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | + V_TF_TLS_CONTROL(1) | + V_TF_TLS_ACTIVE(1) | + V_TF_TLS_ENABLE(1)))); + t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ, + V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), + V_TCB_TLS_SEQ(0)); + t4_clear_rx_quiesce(toep); + } toep->tls.mode = TLS_MODE_KTLS; @@ -1669,7 +1723,7 @@ ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || - ulp_mode(toep) == ULP_MODE_TCPDDP, + ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); @@ -1952,6 +2006,10 @@ struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; +#ifdef KERN_TLS + struct tls_get_record *tgr; + struct mbuf *control; +#endif int len, pdu_length, rx_credits; KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); @@ -1978,6 +2036,7 @@ pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length)); + so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); #ifdef VERBOSE_TRACES @@ -2002,35 +2061,94 @@ ("%s: payload too small", __func__)); tls_hdr_pkt = mtod(m, void *); - /* - * Only the TLS header is sent to OpenSSL, so report errors by - * altering the record type. - */ - if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) - tls_hdr_pkt->type = CONTENT_TYPE_ERROR; - - /* Trim this CPL's mbuf to only include the TLS header. */ - KASSERT(m->m_len == len && m->m_next == NULL, - ("%s: CPL spans multiple mbufs", __func__)); - m->m_len = TLS_HEADER_LENGTH; - m->m_pkthdr.len = TLS_HEADER_LENGTH; - tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq); if (tls_data != NULL) { KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq, ("%s: sequence mismatch", __func__)); + } +#ifdef KERN_TLS + if (toep->tls.mode == TLS_MODE_KTLS) { + /* Report decryption errors as EBADMSG. */ + if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != + 0) { + m_freem(m); + m_freem(tls_data); + + CURVNET_SET(toep->vnet); + so->so_error = EBADMSG; + sorwakeup(so); + + INP_WUNLOCK(inp); + CURVNET_RESTORE(); + + return (0); + } + + /* Allocate the control message mbuf. */ + control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD, + IPPROTO_TCP); + if (control == NULL) { + m_freem(m); + m_freem(tls_data); + + CURVNET_SET(toep->vnet); + so->so_error = ENOBUFS; + sorwakeup(so); + + INP_WUNLOCK(inp); + CURVNET_RESTORE(); + + return (0); + } + + tgr = (struct tls_get_record *) + CMSG_DATA(mtod(control, struct cmsghdr *)); + tgr->tls_type = tls_hdr_pkt->type; + tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8; + tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff; + + m_freem(m); + + if (tls_data != NULL) { + m_last(tls_data)->m_flags |= M_EOR; + tgr->tls_length = htobe16(tls_data->m_pkthdr.len); + } else + tgr->tls_length = 0; + m = tls_data; + } else +#endif + { /* - * Update the TLS header length to be the length of - * the payload data. + * Only the TLS header is sent to OpenSSL, so report + * errors by altering the record type. */ - tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len); + if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != + 0) + tls_hdr_pkt->type = CONTENT_TYPE_ERROR; - m->m_next = tls_data; - m->m_pkthdr.len += tls_data->m_len; + /* Trim this CPL's mbuf to only include the TLS header. 
*/ + KASSERT(m->m_len == len && m->m_next == NULL, + ("%s: CPL spans multiple mbufs", __func__)); + m->m_len = TLS_HEADER_LENGTH; + m->m_pkthdr.len = TLS_HEADER_LENGTH; + + if (tls_data != NULL) { + /* + * Update the TLS header length to be the length of + * the payload data. + */ + tls_hdr_pkt->length = htobe16(tls_data->m_pkthdr.len); + + m->m_next = tls_data; + m->m_pkthdr.len += tls_data->m_len; + } + +#ifdef KERN_TLS + control = NULL; +#endif } - so = inp_inpcbtosocket(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); @@ -2040,6 +2158,9 @@ CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, pdu_length); m_freem(m); +#ifdef KERN_TLS + m_freem(control); +#endif SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); @@ -2076,7 +2197,12 @@ sb->sb_flags &= ~SB_AUTOSIZE; } - sbappendstream_locked(sb, m, 0); +#ifdef KERN_TLS + if (control != NULL) + sbappendcontrol_locked(sb, m, control); + else +#endif + sbappendstream_locked(sb, m, 0); rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u", Index: sys/dev/cxgbe/tom/t4_tom.h =================================================================== --- sys/dev/cxgbe/tom/t4_tom.h +++ sys/dev/cxgbe/tom/t4_tom.h @@ -451,6 +451,6 @@ void tls_stop_handshake_timer(struct toepcb *); int tls_tx_key(struct toepcb *); void tls_uninit_toep(struct toepcb *); -int tls_alloc_ktls(struct toepcb *, struct ktls_session *); +int tls_alloc_ktls(struct toepcb *, struct ktls_session *, bool); #endif Index: sys/dev/cxgbe/tom/t4_tom.c =================================================================== --- sys/dev/cxgbe/tom/t4_tom.c +++ sys/dev/cxgbe/tom/t4_tom.c @@ -814,14 +814,14 @@ #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, - struct ktls_session *tls) + struct ktls_session *tls, bool transmit) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(tls != NULL); - return (tls_alloc_ktls(toep, tls)); + return (tls_alloc_ktls(toep, tls, transmit)); } #endif @@ -1080,7 +1080,9 @@ struct inpcb *inp = sotoinpcb(so); int i, rc; - /* XXX: Eventually add a SO_WANT_TLS socket option perhaps? 
*/ + if (so_options_get(so) & SO_WANT_KTLS) + return (1); + rc = 0; ADAPTER_LOCK(sc); for (i = 0; i < sc->tt.num_tls_rx_ports; i++) { Index: sys/kern/uipc_ktls.c =================================================================== --- sys/kern/uipc_ktls.c +++ sys/kern/uipc_ktls.c @@ -76,7 +76,8 @@ struct ktls_wq { struct mtx mtx; - STAILQ_HEAD(, mbuf_ext_pgs) head; + STAILQ_HEAD(, mbuf_ext_pgs) pgs_head; + STAILQ_HEAD(, socket) so_head; bool running; } __aligned(CACHE_LINE_SIZE); @@ -128,9 +129,15 @@ SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); -static counter_u64_t ktls_cnt_on; -SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD, - &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto"); +static counter_u64_t ktls_cnt_tx_queued; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD, + &ktls_cnt_tx_queued, + "Number of TLS records in queue to tasks for SW encryption"); + +static counter_u64_t ktls_cnt_rx_queued; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD, + &ktls_cnt_rx_queued, + "Number of TLS sockets in queue to tasks for SW decryption"); static counter_u64_t ktls_offload_total; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, @@ -146,6 +153,10 @@ SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); +static counter_u64_t ktls_offload_corrupted_records; +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD, + &ktls_offload_corrupted_records, "Total corrupted TLS records received"); + static counter_u64_t ktls_offload_failed_crypto; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); @@ -331,10 +342,12 @@ int error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); - ktls_cnt_on = counter_u64_alloc(M_WAITOK); + ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK); + ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK); ktls_offload_total = counter_u64_alloc(M_WAITOK); ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK); ktls_offload_active = counter_u64_alloc(M_WAITOK); + ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK); ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK); ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK); ktls_switch_to_sw = counter_u64_alloc(M_WAITOK); @@ -367,7 +380,8 @@ * work queue for each CPU. 
 */
	CPU_FOREACH(i) {
-		STAILQ_INIT(&ktls_wq[i].head);
+		STAILQ_INIT(&ktls_wq[i].pgs_head);
+		STAILQ_INIT(&ktls_wq[i].so_head);
		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
@@ -696,7 +710,7 @@

 #ifdef TCP_OFFLOAD
 static int
-ktls_try_toe(struct socket *so, struct ktls_session *tls)
+ktls_try_toe(struct socket *so, struct ktls_session *tls, bool transmit)
 {
	struct inpcb *inp;
	struct tcpcb *tp;
@@ -722,7 +736,7 @@
		return (EOPNOTSUPP);
	}

-	error = tcp_offload_alloc_tls_session(tp, tls);
+	error = tcp_offload_alloc_tls_session(tp, tls, transmit);
	INP_WUNLOCK(inp);
	if (error == 0) {
		tls->mode = TCP_TLS_MODE_TOE;
@@ -853,7 +867,7 @@
 }

 static int
-ktls_try_sw(struct socket *so, struct ktls_session *tls)
+ktls_try_sw(struct socket *so, struct ktls_session *tls, bool transmit)
 {
	struct rm_priotracker prio;
	struct ktls_crypto_backend *be;
@@ -868,7 +882,7 @@
	if (ktls_allow_unload)
		rm_rlock(&ktls_backends_lock, &prio);
	LIST_FOREACH(be, &ktls_backends, next) {
-		if (be->try(so, tls) == 0)
+		if (be->try(so, tls, transmit) == 0)
			break;
		KASSERT(tls->cipher == NULL,
		    ("ktls backend leaked a cipher pointer"));
@@ -894,6 +908,125 @@
	return (0);
 }

+/*
+ * KTLS RX stores data in the socket buffer as a list of TLS records,
+ * where each record is stored as a control message containing the TLS
+ * header followed by data mbufs containing the decrypted data.  This
+ * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
+ * both encrypted and decrypted data.  TLS records decrypted by a NIC
+ * should be queued to the socket buffer as records, but encrypted
+ * data which needs to be decrypted by software arrives as a stream of
+ * regular mbufs which need to be converted.  In addition, there may
+ * already be pending encrypted data in the socket buffer when KTLS RX
+ * is enabled.
+ *
+ * To manage not-yet-decrypted data for KTLS RX, the following scheme
+ * is used:
+ *
+ * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
+ *
+ * - ktls_check_rx checks this chain of mbufs reading the TLS header
+ *   from the first mbuf.  Once all of the data for that TLS record is
+ *   queued, the socket is queued to a worker thread.
+ *
+ * - The worker thread calls ktls_decrypt to decrypt TLS records in
+ *   the TLS chain.  Each TLS record is detached from the TLS chain,
+ *   decrypted, and inserted into the regular socket buffer chain as
+ *   a record starting with a control message holding the TLS header
+ *   and a chain of mbufs holding the decrypted data.
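+ *
+ * For illustration (sizes hypothetical): three undecrypted 100-byte
+ * records queued on sb_mtls leave sb_acc at 0 and sb_tlscc at 300;
+ * each record's payload is only counted in sb_acc once ktls_decrypt
+ * has appended it to sb_mb.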
+ */ + +static void +sb_mark_notready(struct sockbuf *sb) +{ + struct mbuf *m; + + m = sb->sb_mb; + sb->sb_mtls = m; + sb->sb_mb = NULL; + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + for (; m != NULL; m = m->m_next) { + KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL", + __func__)); + KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail", + __func__)); + KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len", + __func__)); + m->m_flags |= M_NOTREADY; + sb->sb_acc -= m->m_len; + sb->sb_tlscc += m->m_len; + sb->sb_mtlstail = m; + } + KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc, + ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc, + sb->sb_ccc)); +} + +int +ktls_enable_rx(struct socket *so, struct tls_enable *en) +{ + struct ktls_session *tls; + int error; + + if (!ktls_offload_enable) + return (ENOTSUP); + + counter_u64_add(ktls_offload_enable_calls, 1); + + /* + * This should always be true since only the TCP socket option + * invokes this function. + */ + if (so->so_proto->pr_protocol != IPPROTO_TCP) + return (EINVAL); + + /* + * XXX: Don't overwrite existing sessions. We should permit + * this to support rekeying in the future. + */ + if (so->so_rcv.sb_tls_info != NULL) + return (EALREADY); + + if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) + return (ENOTSUP); + + /* TLS 1.3 is not yet supported. */ + if (en->tls_vmajor == TLS_MAJOR_VER_ONE && + en->tls_vminor == TLS_MINOR_VER_THREE) + return (ENOTSUP); + + error = ktls_create_session(so, en, &tls); + if (error) + return (error); + +#ifdef TCP_OFFLOAD + error = ktls_try_toe(so, tls, false); + if (error) +#endif + error = ktls_try_sw(so, tls, false); + + if (error) { + ktls_cleanup(tls); + return (error); + } + + /* Mark the socket as using TLS offload. */ + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq); + so->so_rcv.sb_tls_info = tls; + so->so_rcv.sb_flags |= SB_TLS_RX; + + /* Mark existing data as not ready until it can be decrypted. */ + sb_mark_notready(&so->so_rcv); + ktls_check_rx(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_rcv); + + counter_u64_add(ktls_offload_total, 1); + + return (0); +} + int ktls_enable_tx(struct socket *so, struct tls_enable *en) { @@ -932,12 +1065,12 @@ /* Prefer TOE -> ifnet TLS -> software TLS. 
*/ #ifdef TCP_OFFLOAD - error = ktls_try_toe(so, tls); + error = ktls_try_toe(so, tls, true); if (error) #endif error = ktls_try_ifnet(so, tls, false); if (error) - error = ktls_try_sw(so, tls); + error = ktls_try_sw(so, tls, true); if (error) { ktls_cleanup(tls); @@ -951,6 +1084,7 @@ } SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; if (tls->mode != TCP_TLS_MODE_SW) so->so_snd.sb_flags |= SB_TLS_IFNET; @@ -962,6 +1096,25 @@ return (0); } +int +ktls_get_rx_mode(struct socket *so) +{ + struct ktls_session *tls; + struct inpcb *inp; + int mode; + + inp = so->so_pcb; + INP_WLOCK_ASSERT(inp); + SOCKBUF_LOCK(&so->so_rcv); + tls = so->so_rcv.sb_tls_info; + if (tls == NULL) + mode = TCP_TLS_MODE_NONE; + else + mode = tls->mode; + SOCKBUF_UNLOCK(&so->so_rcv); + return (mode); +} + int ktls_get_tx_mode(struct socket *so) { @@ -1022,7 +1175,7 @@ if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else - error = ktls_try_sw(so, tls_new); + error = ktls_try_sw(so, tls_new, true); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); @@ -1348,6 +1501,371 @@ } } +void +ktls_check_rx(struct sockbuf *sb) +{ + struct tls_record_layer hdr; + struct ktls_wq *wq; + struct socket *so; + bool running; + + SOCKBUF_LOCK_ASSERT(sb); + KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX", + __func__, sb)); + so = __containerof(sb, struct socket, so_rcv); + + if (sb->sb_flags & SB_TLS_RX_RUNNING) + return; + + /* Is there enough queued for a TLS header? */ + if (sb->sb_tlscc < sizeof(hdr)) { + if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0) + so->so_error = EMSGSIZE; + return; + } + + m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr); + + /* Is the entire record queued? */ + if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) { + if ((sb->sb_state & SBS_CANTRCVMORE) != 0) + so->so_error = EMSGSIZE; + return; + } + + sb->sb_flags |= SB_TLS_RX_RUNNING; + + soref(so); + wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index]; + mtx_lock(&wq->mtx); + STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list); + running = wq->running; + mtx_unlock(&wq->mtx); + if (!running) + wakeup(wq); + counter_u64_add(ktls_cnt_rx_queued, 1); +} + +static struct mbuf * +ktls_detach_record(struct sockbuf *sb, int len) +{ + struct mbuf *m, *n, *top; + int remain; + + SOCKBUF_LOCK_ASSERT(sb); + MPASS(len <= sb->sb_tlscc); + + /* + * If TLS chain is the exact size of the record, + * just grab the whole record. + */ + top = sb->sb_mtls; + if (sb->sb_tlscc == len) { + sb->sb_mtls = NULL; + sb->sb_mtlstail = NULL; + goto out; + } + + /* + * While it would be nice to use m_split() here, we need + * to know exactly what m_split() allocates to update the + * accounting, so do it inline instead. + */ + remain = len; + for (m = top; remain > m->m_len; m = m->m_next) + remain -= m->m_len; + + /* Easy case: don't have to split 'm'. */ + if (remain == m->m_len) { + sb->sb_mtls = m->m_next; + if (sb->sb_mtls == NULL) + sb->sb_mtlstail = NULL; + m->m_next = NULL; + goto out; + } + + /* + * Need to allocate an mbuf to hold the remainder of 'm'. Try + * with M_NOWAIT first. + */ + n = m_get(M_NOWAIT, MT_DATA); + if (n == NULL) { + /* + * Use M_WAITOK with socket buffer unlocked. If + * 'sb_mtls' changes while the lock is dropped, return + * NULL to force the caller to retry. 
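+		 * (The only caller, ktls_decrypt, simply restarts
+		 * its record loop when NULL is returned.)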
+ */ + SOCKBUF_UNLOCK(sb); + + n = m_get(M_WAITOK, MT_DATA); + + SOCKBUF_LOCK(sb); + if (sb->sb_mtls != top) { + m_free(n); + return (NULL); + } + } + n->m_flags |= M_NOTREADY; + + /* Store remainder in 'n'. */ + n->m_len = m->m_len - remain; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + remain; + mb_dupcl(n, m); + } else { + bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len); + } + + /* Trim 'm' and update accounting. */ + m->m_len -= n->m_len; + sb->sb_tlscc -= n->m_len; + sb->sb_ccc -= n->m_len; + + /* Account for 'n'. */ + sballoc_ktls_rx(sb, n); + + /* Insert 'n' into the TLS chain. */ + sb->sb_mtls = n; + n->m_next = m->m_next; + if (sb->sb_mtlstail == m) + sb->sb_mtlstail = n; + + /* Detach the record from the TLS chain. */ + m->m_next = NULL; + +out: + MPASS(m_length(top, NULL) == len); + for (m = top; m != NULL; m = m->m_next) + sbfree_ktls_rx(sb, m); + sb->sb_tlsdcc = len; + sb->sb_ccc += len; + SBCHECK(sb); + return (top); +} + +static int +m_segments(struct mbuf *m, int skip) +{ + int count; + + while (skip >= m->m_len) { + skip -= m->m_len; + m = m->m_next; + } + + for (count = 0; m != NULL; count++) + m = m->m_next; + return (count); +} + +static void +ktls_decrypt(struct socket *so) +{ + char tls_header[MBUF_PEXT_HDR_LEN]; + struct ktls_session *tls; + struct sockbuf *sb; + struct tls_record_layer *hdr; + struct iovec *iov; + struct tls_get_record tgr; + struct mbuf *control, *data, *m; + uint64_t seqno; + int error, i, iov_cap, iov_count, remain, tls_len, trail_len; + + hdr = (struct tls_record_layer *)tls_header; + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING, + ("%s: socket %p not running", __func__, so)); + + tls = sb->sb_tls_info; + MPASS(tls != NULL); + + iov = NULL; + iov_cap = 0; + for (;;) { + /* Is there enough queued for a TLS header? */ + if (sb->sb_tlscc < tls->params.tls_hlen) + break; + + m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header); + tls_len = sizeof(*hdr) + ntohs(hdr->tls_length); + + if (hdr->tls_vmajor != tls->params.tls_vmajor || + hdr->tls_vminor != tls->params.tls_vminor) + error = EINVAL; + else if (tls_len < tls->params.tls_hlen || tls_len > + tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 + + tls->params.tls_tlen) + error = EMSGSIZE; + else + error = 0; + if (__predict_false(error != 0)) { + /* + * We have a corrupted record and are likely + * out of sync. The connection isn't + * recoverable at this point, so abort it. + */ + SOCKBUF_UNLOCK(sb); + counter_u64_add(ktls_offload_corrupted_records, 1); + + CURVNET_SET(so->so_vnet); + so->so_proto->pr_usrreqs->pru_abort(so); + so->so_error = error; + CURVNET_RESTORE(); + goto deref; + } + + /* Is the entire record queued? */ + if (sb->sb_tlscc < tls_len) + break; + + /* + * Split out the portion of the mbuf chain containing + * this TLS record. + */ + data = ktls_detach_record(sb, tls_len); + if (data == NULL) + continue; + MPASS(sb->sb_tlsdcc == tls_len); + + seqno = sb->sb_tls_seqno; + sb->sb_tls_seqno++; + SBCHECK(sb); + SOCKBUF_UNLOCK(sb); + + /* + * Build an I/O vector spanning the TLS record payload + * and trailer but skipping the header. 
+ */ + iov_count = m_segments(data, tls->params.tls_hlen); + if (iov_count > iov_cap) { + free(iov, M_KTLS); + iov = malloc(sizeof(*iov) * iov_count, M_KTLS, + M_WAITOK); + iov_cap = iov_count; + } + remain = tls->params.tls_hlen; + for (m = data; remain >= m->m_len; m = m->m_next) + remain -= m->m_len; + iov[0].iov_base = m->m_data + remain; + iov[0].iov_len = m->m_len - remain; + for (m = m->m_next, i = 1; m != NULL; m = m->m_next, i++) { + iov[i].iov_base = m->m_data; + iov[i].iov_len = m->m_len; + } + MPASS(i == iov_count); + + error = tls->sw_decrypt(tls, hdr, iov, iov_count, seqno, + &trail_len); + if (error) { + counter_u64_add(ktls_offload_failed_crypto, 1); + + SOCKBUF_LOCK(sb); + if (sb->sb_tlsdcc == 0) { + /* + * sbcut/drop/flush discarded these + * mbufs. + */ + m_freem(data); + break; + } + + /* + * Drop this TLS record's data, but keep + * decrypting subsequent records. + */ + sb->sb_ccc -= tls_len; + sb->sb_tlsdcc = 0; + + CURVNET_SET(so->so_vnet); + so->so_error = EBADMSG; + sorwakeup_locked(so); + CURVNET_RESTORE(); + + m_freem(data); + + SOCKBUF_LOCK(sb); + continue; + } + + /* Allocate the control mbuf. */ + tgr.tls_type = hdr->tls_type; + tgr.tls_vmajor = hdr->tls_vmajor; + tgr.tls_vminor = hdr->tls_vminor; + tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen - + trail_len); + control = sbcreatecontrol_how(&tgr, sizeof(tgr), + TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK); + + SOCKBUF_LOCK(sb); + if (sb->sb_tlsdcc == 0) { + /* sbcut/drop/flush discarded these mbufs. */ + MPASS(sb->sb_tlscc == 0); + m_freem(data); + m_freem(control); + break; + } + + /* + * Clear the 'dcc' accounting in preparation for + * adding the decrypted record. + */ + sb->sb_ccc -= tls_len; + sb->sb_tlsdcc = 0; + SBCHECK(sb); + + /* If there is no payload, drop all of the data. */ + if (tgr.tls_length == htobe16(0)) { + m_freem(data); + data = NULL; + } else { + /* Trim header. */ + remain = tls->params.tls_hlen; + while (remain > 0) { + if (data->m_len > remain) { + data->m_data += remain; + data->m_len -= remain; + break; + } + remain -= data->m_len; + data = m_free(data); + } + + /* Trim trailer and clear M_NOTREADY. */ + remain = be16toh(tgr.tls_length); + m = data; + for (m = data; remain > m->m_len; m = m->m_next) { + m->m_flags &= ~M_NOTREADY; + remain -= m->m_len; + } + m->m_len = remain; + m_freem(m->m_next); + m->m_next = NULL; + m->m_flags &= ~M_NOTREADY; + + /* Set EOR on the final mbuf. 
*/
+				m->m_flags |= M_EOR;
+			}
+
+		sbappendcontrol_locked(sb, data, control);
+	}

+	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
+
+	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
+		so->so_error = EMSGSIZE;
+
+	sorwakeup_locked(so);
+
+deref:
+	SOCKBUF_UNLOCK_ASSERT(sb);
+
+	CURVNET_SET(so->so_vnet);
+	SOCK_LOCK(so);
+	sorele(so);
+	CURVNET_RESTORE();
+}
+
 void
 ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs)
 {
@@ -1358,7 +1876,7 @@
	pgs->mbuf = NULL;
	wq = &ktls_wq[pgs->tls->wq_index];
	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, pgs, stailq);
+	STAILQ_INSERT_TAIL(&wq->pgs_head, pgs, stailq);
	running = wq->running;
	mtx_unlock(&wq->mtx);
	if (!running)
@@ -1392,12 +1910,12 @@

	wq = &ktls_wq[pgs->tls->wq_index];
	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, pgs, stailq);
+	STAILQ_INSERT_TAIL(&wq->pgs_head, pgs, stailq);
	running = wq->running;
	mtx_unlock(&wq->mtx);
	if (!running)
		wakeup(wq);
-	counter_u64_add(ktls_cnt_on, 1);
+	counter_u64_add(ktls_cnt_tx_queued, 1);
 }

 static __noinline void
@@ -1551,34 +2069,44 @@
 ktls_work_thread(void *ctx)
 {
	struct ktls_wq *wq = ctx;
-	struct mbuf_ext_pgs *p, *n;
+	struct mbuf_ext_pgs *p, *pn;
+	struct socket *so, *son;
	struct ktls_session *tls;
-	STAILQ_HEAD(, mbuf_ext_pgs) local_head;
+	STAILQ_HEAD(, mbuf_ext_pgs) local_pgs_head;
+	STAILQ_HEAD(, socket) local_so_head;

 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
	fpu_kern_thread(0);
 #endif
	for (;;) {
		mtx_lock(&wq->mtx);
-		while (STAILQ_EMPTY(&wq->head)) {
+		while (STAILQ_EMPTY(&wq->pgs_head) &&
+		    STAILQ_EMPTY(&wq->so_head)) {
			wq->running = false;
			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
			wq->running = true;
		}

-		STAILQ_INIT(&local_head);
-		STAILQ_CONCAT(&local_head, &wq->head);
+		STAILQ_INIT(&local_pgs_head);
+		STAILQ_CONCAT(&local_pgs_head, &wq->pgs_head);
+		STAILQ_INIT(&local_so_head);
+		STAILQ_CONCAT(&local_so_head, &wq->so_head);
		mtx_unlock(&wq->mtx);

-		STAILQ_FOREACH_SAFE(p, &local_head, stailq, n) {
+		STAILQ_FOREACH_SAFE(p, &local_pgs_head, stailq, pn) {
			if (p->mbuf != NULL) {
				ktls_encrypt(p);
-				counter_u64_add(ktls_cnt_on, -1);
+				counter_u64_add(ktls_cnt_tx_queued, -1);
			} else {
				tls = p->tls;
				ktls_free(tls);
				uma_zfree(zone_extpgs, p);
			}
		}
+
+		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
+			ktls_decrypt(so);
+			counter_u64_add(ktls_cnt_rx_queued, -1);
+		}
	}
 }
Index: sys/kern/uipc_sockbuf.c
===================================================================
--- sys/kern/uipc_sockbuf.c
+++ sys/kern/uipc_sockbuf.c
@@ -70,6 +70,8 @@

 static u_long sb_efficiency = 8;	/* parameter for sbreserve() */

+static void	sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m,
+		    struct mbuf *n);
 static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);

@@ -313,6 +315,51 @@
	sb->sb_sndptroff -= m->m_len;
 }

+#ifdef KERN_TLS
+/*
+ * Similar to sballoc/sbfree but does not adjust state associated with
+ * the sb_mb chain such as sb_fnrdy or sb_sndptr*.  Also assumes mbufs
+ * are not ready.
+ */
+void
+sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sb->sb_ccc += m->m_len;
+	sb->sb_tlscc += m->m_len;
+
+	sb->sb_mbcnt += MSIZE;
+	sb->sb_mcnt += 1;
+
+	if (m->m_flags & M_EXT) {
+		sb->sb_mbcnt += m->m_ext.ext_size;
+		sb->sb_ccnt += 1;
+	}
+}
+
+void
+sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m)
+{
+
+#if 0	/* XXX: not yet: soclose() call path comes here w/o lock.
*/ + SOCKBUF_LOCK_ASSERT(sb); +#endif + + sb->sb_ccc -= m->m_len; + sb->sb_tlscc -= m->m_len; + + sb->sb_mbcnt -= MSIZE; + sb->sb_mcnt -= 1; + + if (m->m_flags & M_EXT) { + sb->sb_mbcnt -= m->m_ext.ext_size; + sb->sb_ccnt -= 1; + } +} +#endif + /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system @@ -349,6 +396,10 @@ SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; +#ifdef KERN_TLS + if (so->so_rcv.sb_flags & SB_TLS_RX) + ktls_check_rx(&so->so_rcv); +#endif sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } @@ -749,6 +800,24 @@ } panic("%s from %s:%u", __func__, file, line); } + +#ifdef KERN_TLS + m = sb->sb_mtls; + while (m && m->m_next) + m = m->m_next; + + if (m != sb->sb_mtlstail) { + printf("%s: sb_mtls %p sb_mtlstail %p last %p\n", + __func__, sb->sb_mtls, sb->sb_mtlstail, m); + printf("TLS packet tree:\n"); + printf("\t"); + for (m = sb->sb_mtls; m != NULL; m = m->m_next) { + printf("%p ", m); + } + printf("\n"); + panic("%s from %s:%u", __func__, file, line); + } +#endif } #endif /* SOCKBUF_DEBUG */ @@ -826,6 +895,29 @@ SOCKBUF_UNLOCK(sb); } +#ifdef KERN_TLS +/* + * Append an mbuf containing encrypted TLS data. The data + * is marked M_NOTREADY until it has been decrypted and + * stored as a TLS record. + */ +static void +sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m) +{ + struct mbuf *n; + + SBLASTMBUFCHK(sb); + + /* Remove all packet headers and mbuf tags to get a pure data chain. */ + m_demote(m, 1, 0); + + for (n = m; n != NULL; n = n->m_next) + n->m_flags |= M_NOTREADY; + sbcompress_ktls_rx(sb, m, sb->sb_mtlstail); + ktls_check_rx(sb); +} +#endif + /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, @@ -837,6 +929,19 @@ SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); + +#ifdef KERN_TLS + /* + * Decrypted TLS records are appended as records via + * sbappendrecord(). TCP passes encrypted TLS records to this + * function which must be scheduled for decryption. + */ + if (sb->sb_flags & SB_TLS_RX) { + sbappend_ktls_rx(sb, m); + return; + } +#endif + KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); @@ -875,6 +980,9 @@ { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; +#ifdef KERN_TLS + u_long tlscc; +#endif SOCKBUF_LOCK_ASSERT(sb); @@ -910,9 +1018,46 @@ mbcnt += m->m_ext.ext_size; } } +#ifdef KERN_TLS + /* + * Account for mbufs "detached" by ktls_detach_record() while + * they are decrypted by ktls_decrypt(). tlsdcc gives a count + * of the detached bytes that are included in ccc. The mbufs + * and clusters are not included in the socket buffer + * accounting. 
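+	 * For example, while a hypothetical 517-byte record is being
+	 * decrypted, sb_tlsdcc is 517 and those bytes are counted in
+	 * ccc even though neither chain holds them.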
+ */ + ccc += sb->sb_tlsdcc; + + tlscc = 0; + for (m = sb->sb_mtls; m; m = m->m_next) { + if (m->m_nextpkt != NULL) { + printf("sb %p TLS mbuf %p with nextpkt\n", sb, m); + goto fail; + } + if ((m->m_flags & M_NOTREADY) == 0) { + printf("sb %p TLS mbuf %p ready\n", sb, m); + goto fail; + } + tlscc += m->m_len; + ccc += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + + if (sb->sb_tlscc != tlscc) { + printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, + sb->sb_tlsdcc); + goto fail; + } +#endif if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); +#ifdef KERN_TLS + printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, + sb->sb_tlsdcc); +#endif goto fail; } return; @@ -1187,6 +1332,64 @@ SBLASTMBUFCHK(sb); } +#ifdef KERN_TLS +/* + * A version of sbcompress() for encrypted TLS RX mbufs. These mbufs + * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also + * a bit simpler (no EOR markers, always MT_DATA, etc.). + */ +static void +sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) +{ + + SOCKBUF_LOCK_ASSERT(sb); + + while (m) { + KASSERT((m->m_flags & M_EOR) == 0, + ("TLS RX mbuf %p with EOR", m)); + KASSERT(m->m_type == MT_DATA, + ("TLS RX mbuf %p is not MT_DATA", m)); + KASSERT((m->m_flags & M_NOTREADY) != 0, + ("TLS RX mbuf %p ready", m)); + KASSERT((m->m_flags & M_NOMAP) == 0, + ("TLS RX mbuf %p unmapped", m)); + + if (m->m_len == 0) { + m = m_free(m); + continue; + } + + /* + * Even though both 'n' and 'm' are NOTREADY, it's ok + * to coalesce the data. + */ + if (n && + M_WRITABLE(n) && + ((sb->sb_flags & SB_NOCOALESCE) == 0) && + !(n->m_flags & (M_NOMAP)) && + m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + m->m_len <= M_TRAILINGSPACE(n)) { + m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); + n->m_len += m->m_len; + sb->sb_ccc += m->m_len; + sb->sb_tlscc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mtls = m; + sb->sb_mtlstail = m; + sballoc_ktls_rx(sb, m); + n = m; + m = m->m_next; + n->m_next = NULL; + } + SBLASTMBUFCHK(sb); +} +#endif + /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ @@ -1194,7 +1397,7 @@ sbflush_internal(struct sockbuf *sb) { - while (sb->sb_mbcnt) { + while (sb->sb_mbcnt || sb->sb_tlsdcc) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. @@ -1232,6 +1435,7 @@ sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; + bool is_tls; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); @@ -1239,10 +1443,25 @@ __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; + is_tls = false; mfree = NULL; while (len > 0) { if (m == NULL) { +#ifdef KERN_TLS + if (next == NULL && !is_tls) { + if (sb->sb_tlsdcc != 0) { + MPASS(len >= sb->sb_tlsdcc); + len -= sb->sb_tlsdcc; + sb->sb_ccc -= sb->sb_tlsdcc; + sb->sb_tlsdcc = 0; + if (len == 0) + break; + } + next = sb->sb_mtls; + is_tls = true; + } +#endif KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; @@ -1261,12 +1480,17 @@ break; } len -= m->m_len; - sbfree(sb, m); +#ifdef KERN_TLS + if (is_tls) + sbfree_ktls_rx(sb, m); + else +#endif + sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. 
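+		 * KTLS RX mbufs on the sb_mtls chain are not referenced
+		 * externally, however, so it is safe to free them below
+		 * even though M_NOTREADY is set.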
 */
-		if (m->m_flags & M_NOTREADY)
+		if (m->m_flags & M_NOTREADY && !is_tls)
			m = m->m_next;
		else {
			struct mbuf *n;
@@ -1292,6 +1516,14 @@
			mfree = m;
			m = n;
		}
+#ifdef KERN_TLS
+	if (is_tls) {
+		sb->sb_mb = NULL;
+		sb->sb_mtls = m;
+		if (m == NULL)
+			sb->sb_mtlstail = NULL;
+	} else
+#endif
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
@@ -1467,17 +1699,18 @@
 * type for presentation on a socket buffer.
 */
 struct mbuf *
-sbcreatecontrol(caddr_t p, int size, int type, int level)
+sbcreatecontrol_how(void *p, int size, int type, int level, int wait)
 {
	struct cmsghdr *cp;
	struct mbuf *m;

+	MBUF_CHECKSLEEP(wait);
	if (CMSG_SPACE((u_int)size) > MCLBYTES)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE((u_int)size) > MLEN)
-		m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
+		m = m_getcl(wait, MT_CONTROL, 0);
	else
-		m = m_get(M_NOWAIT, MT_CONTROL);
+		m = m_get(wait, MT_CONTROL);
	if (m == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
@@ -1498,6 +1731,13 @@
	return (m);
 }

+struct mbuf *
+sbcreatecontrol(caddr_t p, int size, int type, int level)
+{
+
+	return (sbcreatecontrol_how(p, size, type, level, M_NOWAIT));
+}
+
 /*
 * This does the same for socket buffers that sotoxsocket does for sockets:
 * generate an user-format data structure describing the socket buffer. Note
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1840,7 +1840,8 @@
	 * 1. the current count is less than the low water mark, or
	 * 2. MSG_DONTWAIT is not set
	 */
-	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
+	if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0 ||
+	    (((flags & MSG_DONTWAIT) == 0 &&
	    sbavail(&so->so_rcv) < uio->uio_resid) &&
	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
@@ -1848,7 +1849,7 @@
		    ("receive: m == %p sbavail == %u",
		    m, sbavail(&so->so_rcv)));
		if (so->so_error) {
-			if (m != NULL)
+			if (m != NULL && (m->m_flags & M_NOTAVAIL) == 0)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
@@ -1858,13 +1859,15 @@
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
-			if (m == NULL) {
+			if (m == NULL && so->so_rcv.sb_tlsdcc == 0 &&
+			    so->so_rcv.sb_tlscc == 0) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
-			} else
+			} else if (m != NULL && (m->m_flags & M_NOTAVAIL) == 0)
				goto dontblock;
		}
-		for (; m != NULL; m = m->m_next)
+		for (; m != NULL && (m->m_flags & M_NOTAVAIL) == 0;
+		    m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
@@ -2274,12 +2277,34 @@

	sb = &so->so_rcv;

+#ifdef KERN_TLS
+	/*
+	 * KTLS stores TLS records as records with a control message to
+	 * describe the framing.
+	 *
+	 * We check once here before acquiring locks to optimize the
+	 * common case.
+	 */
+	if (sb->sb_tls_info != NULL)
+		return (soreceive_generic(so, psa, uio, mp0, controlp,
+		    flagsp));
+#endif
+
	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

+#ifdef KERN_TLS
+	if (sb->sb_tls_info != NULL) {
+		SOCKBUF_UNLOCK(sb);
+		sbunlock(sb);
+		return (soreceive_generic(so, psa, uio, mp0, controlp,
+		    flagsp));
+	}
+#endif
+
	/* Easy one, no space to copyout anything.
*/ if (uio->uio_resid == 0) { error = EINVAL; @@ -2882,6 +2907,7 @@ case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: + case SO_WANT_KTLS: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -3101,6 +3127,9 @@ case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: + case SO_NO_DDP: + case SO_NO_OFFLOAD: + case SO_WANT_KTLS: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -178,6 +178,8 @@ device */ #define TCP_TXTLS_ENABLE 39 /* TLS framing and encryption for transmit */ #define TCP_TXTLS_MODE 40 /* Transmit TLS mode */ +#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */ +#define TCP_RXTLS_MODE 42 /* Receive TLS mode */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_DELACK 72 /* socket option for delayed ack */ @@ -365,6 +367,7 @@ * TCP Control message types */ #define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD 2 /* * TCP specific variables of interest for tp->t_stats stats(9) accounting. Index: sys/netinet/tcp_offload.h =================================================================== --- sys/netinet/tcp_offload.h +++ sys/netinet/tcp_offload.h @@ -46,7 +46,7 @@ void tcp_offload_rcvd(struct tcpcb *); void tcp_offload_ctloutput(struct tcpcb *, int, int); void tcp_offload_tcp_info(struct tcpcb *, struct tcp_info *); -int tcp_offload_alloc_tls_session(struct tcpcb *, struct ktls_session *); +int tcp_offload_alloc_tls_session(struct tcpcb *, struct ktls_session *, bool); void tcp_offload_detach(struct tcpcb *); #endif Index: sys/netinet/tcp_offload.c =================================================================== --- sys/netinet/tcp_offload.c +++ sys/netinet/tcp_offload.c @@ -179,14 +179,15 @@ } int -tcp_offload_alloc_tls_session(struct tcpcb *tp, struct ktls_session *tls) +tcp_offload_alloc_tls_session(struct tcpcb *tp, struct ktls_session *tls, + bool transmit) { struct toedev *tod = tp->tod; KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); INP_WLOCK_ASSERT(tp->t_inpcb); - return (tod->tod_alloc_tls_session(tod, tp, tls)); + return (tod->tod_alloc_tls_session(tod, tp, tls, transmit)); } void Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -1823,6 +1823,37 @@ CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif +#ifdef KERN_TLS +static int +copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) +{ + struct tls_enable_v0 tls_v0; + int error; + + if (sopt->sopt_valsize == sizeof(tls_v0)) { + error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), + sizeof(tls_v0)); + if (error) + return (error); + memset(tls, 0, sizeof(*tls)); + tls->cipher_key = tls_v0.cipher_key; + tls->iv = tls_v0.iv; + tls->auth_key = tls_v0.auth_key; + tls->cipher_algorithm = tls_v0.cipher_algorithm; + tls->cipher_key_len = tls_v0.cipher_key_len; + tls->iv_len = tls_v0.iv_len; + tls->auth_algorithm = tls_v0.auth_algorithm; + tls->auth_key_len = tls_v0.auth_key_len; + tls->flags = tls_v0.flags; + tls->tls_vmajor = tls_v0.tls_vmajor; + tls->tls_vminor = tls_v0.tls_vminor; + return (0); + } + + return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls))); +} +#endif + int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, 
struct inpcb *inp, struct tcpcb *tp) { @@ -2034,8 +2065,7 @@ #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); - error = sooptcopyin(sopt, &tls, sizeof(tls), - sizeof(tls)); + error = copyin_tls_enable(sopt, &tls); if (error) break; error = ktls_enable_tx(so, &tls); @@ -2050,6 +2080,14 @@ error = ktls_set_tx_mode(so, ui); INP_WUNLOCK(inp); break; + case TCP_RXTLS_ENABLE: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &tls, sizeof(tls), + sizeof(tls)); + if (error) + break; + error = ktls_enable_rx(so, &tls); + break; #endif case TCP_KEEPIDLE: @@ -2388,6 +2426,11 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; + case TCP_RXTLS_MODE: + optval = ktls_get_rx_mode(so); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; #endif default: INP_WUNLOCK(inp); Index: sys/netinet/toecore.h =================================================================== --- sys/netinet/toecore.h +++ sys/netinet/toecore.h @@ -112,7 +112,7 @@ /* Create a TLS session */ int (*tod_alloc_tls_session)(struct toedev *, struct tcpcb *, - struct ktls_session *); + struct ktls_session *, bool); }; typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -193,7 +193,7 @@ static int toedev_alloc_tls_session(struct toedev *tod __unused, struct tcpcb *tp __unused, - struct ktls_session *tls __unused) + struct ktls_session *tls __unused, bool transmit __unused) { return (EINVAL); Index: sys/opencrypto/ktls_ocf.c =================================================================== --- sys/opencrypto/ktls_ocf.c +++ sys/opencrypto/ktls_ocf.c @@ -208,6 +208,110 @@ return (error); } +static int +ktls_ocf_tls12_gcm_decrypt(struct ktls_session *tls, + const struct tls_record_layer *hdr, struct iovec *iniov, int iovcnt, + uint64_t seqno, int *trailer_len) +{ + struct uio uio; + struct tls_aead_data ad; + struct tls_nonce_data nd; + struct cryptodesc *crde, *crda; + struct cryptop *crp; + struct ocf_session *os; + struct ocf_operation *oo; + struct iovec *iov; + int error; + uint16_t tls_comp_len; + + os = tls->cipher; + + oo = malloc(sizeof(*oo) + (iovcnt + 1) * sizeof(*iov), M_KTLS_OCF, + M_WAITOK | M_ZERO); + oo->os = os; + iov = oo->iov; + + crp = crypto_getreq(2); + if (crp == NULL) { + free(oo, M_KTLS_OCF); + return (ENOMEM); + } + + /* Setup the IV. */ + memcpy(nd.fixed, tls->params.iv, TLS_AEAD_GCM_LEN); + memcpy(&nd.seq, hdr + 1, sizeof(nd.seq)); + + /* Setup the AAD. */ + tls_comp_len = ntohs(hdr->tls_length) - + (AES_GMAC_HASH_LEN + sizeof(nd.seq)); + ad.seq = htobe64(seqno); + ad.type = hdr->tls_type; + ad.tls_vmajor = hdr->tls_vmajor; + ad.tls_vminor = hdr->tls_vminor; + ad.tls_length = htons(tls_comp_len); + iov[0].iov_base = &ad; + iov[0].iov_len = sizeof(ad); + uio.uio_resid = sizeof(ad); + + /* Copy over IOV entries for the payload and trailer. 
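+	 * The AAD was placed in iov[0] above, so the caller's payload
+	 * and trailer entries land at iov[1] onward.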
*/ + memcpy(iov + 1, iniov, iovcnt * sizeof(*iov)); + + uio.uio_resid = sizeof(ad) + tls_comp_len + AES_GMAC_HASH_LEN; + uio.uio_iov = iov; + uio.uio_iovcnt = iovcnt + 1; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_td = curthread; + + crp->crp_session = os->sid; + crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIMM; + crp->crp_uio = &uio; + crp->crp_ilen = uio.uio_resid; + crp->crp_opaque = oo; + crp->crp_callback = ktls_ocf_callback; + + crde = crp->crp_desc; + crda = crde->crd_next; + + crda->crd_alg = os->crda_alg; + crda->crd_skip = 0; + crda->crd_len = sizeof(ad); + crda->crd_inject = crp->crp_ilen - AES_GMAC_HASH_LEN; + + crde->crd_alg = CRYPTO_AES_NIST_GCM_16; + crde->crd_skip = sizeof(ad); + crde->crd_len = crp->crp_ilen - (sizeof(ad) + AES_GMAC_HASH_LEN); + crde->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; + memcpy(crde->crd_iv, &nd, sizeof(nd)); + + counter_u64_add(ocf_tls12_gcm_crypts, 1); + for (;;) { + error = crypto_dispatch(crp); + if (error) + break; + + mtx_lock(&os->lock); + while (!oo->done) + mtx_sleep(oo, &os->lock, 0, "ocfktls", 0); + mtx_unlock(&os->lock); + + if (crp->crp_etype != EAGAIN) { + error = crp->crp_etype; + break; + } + + crp->crp_etype = 0; + crp->crp_flags &= ~CRYPTO_F_DONE; + oo->done = false; + counter_u64_add(ocf_retries, 1); + } + + crypto_freereq(crp); + free(oo, M_KTLS_OCF); + *trailer_len = AES_GMAC_HASH_LEN; + return (error); +} + static int ktls_ocf_tls13_gcm_encrypt(struct ktls_session *tls, const struct tls_record_layer *hdr, uint8_t *trailer, struct iovec *iniov, @@ -332,7 +436,7 @@ } static int -ktls_ocf_try(struct socket *so, struct ktls_session *tls) +ktls_ocf_try(struct socket *so, struct ktls_session *tls, bool transmit) { struct cryptoini cria, crie; struct ocf_session *os; @@ -366,6 +470,10 @@ tls->params.tls_vminor > TLS_MINOR_VER_THREE) return (EPROTONOSUPPORT); + /* TLS 1.3 is not yet supported for receive. */ + if (!transmit && tls->params.tls_vminor == TLS_MINOR_VER_THREE) + return (EPROTONOSUPPORT); + os = malloc(sizeof(*os), M_KTLS_OCF, M_NOWAIT | M_ZERO); if (os == NULL) return (ENOMEM); @@ -385,10 +493,14 @@ os->crda_alg = cria.cri_alg; mtx_init(&os->lock, "ktls_ocf", NULL, MTX_DEF); tls->cipher = os; - if (tls->params.tls_vminor == TLS_MINOR_VER_THREE) - tls->sw_encrypt = ktls_ocf_tls13_gcm_encrypt; - else - tls->sw_encrypt = ktls_ocf_tls12_gcm_encrypt; + if (transmit) { + if (tls->params.tls_vminor == TLS_MINOR_VER_THREE) + tls->sw_encrypt = ktls_ocf_tls13_gcm_encrypt; + else + tls->sw_encrypt = ktls_ocf_tls12_gcm_encrypt; + } else { + tls->sw_decrypt = ktls_ocf_tls12_gcm_decrypt; + } tls->free = ktls_ocf_free; return (0); } Index: sys/sys/ktls.h =================================================================== --- sys/sys/ktls.h +++ sys/sys/ktls.h @@ -98,7 +98,23 @@ #define TLS_MINOR_VER_TWO 3 /* 3, 3 */ #define TLS_MINOR_VER_THREE 4 /* 3, 4 */ -/* For TCP_TXTLS_ENABLE */ +/* For TCP_TXTLS_ENABLE and TCP_RXTLS_ENABLE. */ +#ifdef _KERNEL +struct tls_enable_v0 { + const uint8_t *cipher_key; + const uint8_t *iv; /* Implicit IV. */ + const uint8_t *auth_key; + int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ + int cipher_key_len; + int iv_len; + int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ + int auth_key_len; + int flags; + uint8_t tls_vmajor; + uint8_t tls_vminor; +}; +#endif + struct tls_enable { const uint8_t *cipher_key; const uint8_t *iv; /* Implicit IV. */ @@ -111,8 +127,20 @@ int flags; uint8_t tls_vmajor; uint8_t tls_vminor; + uint8_t rec_seq[8]; }; +/* Structure for TLS_GET_RECORD. 
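+ *
+ * A hypothetical userland consumer reads one record per recvmsg(2)
+ * call on TLS socket s and recovers the framing from the control
+ * message (tls_length is stored in network byte order), e.g.:
+ *
+ *	char buf[16384], cbuf[CMSG_SPACE(sizeof(struct tls_get_record))];
+ *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+ *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
+ *	    .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
+ *	struct cmsghdr *cmsg;
+ *	struct tls_get_record tgr;
+ *
+ *	if (recvmsg(s, &msg, 0) > 0) {
+ *		cmsg = CMSG_FIRSTHDR(&msg);
+ *		if (cmsg != NULL && cmsg->cmsg_level == IPPROTO_TCP &&
+ *		    cmsg->cmsg_type == TLS_GET_RECORD)
+ *			memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
+ *	}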
*/ +struct tls_get_record { + /* TLS record header. */ + uint8_t tls_type; + uint8_t tls_vmajor; + uint8_t tls_vminor; + uint16_t tls_length; +}; + +#ifdef _KERNEL + struct tls_session_params { uint8_t *cipher_key; uint8_t *auth_key; @@ -131,9 +159,7 @@ uint8_t flags; }; -#ifdef _KERNEL - -#define KTLS_API_VERSION 6 +#define KTLS_API_VERSION 7 struct iovec; struct ktls_session; @@ -145,7 +171,7 @@ struct ktls_crypto_backend { LIST_ENTRY(ktls_crypto_backend) next; - int (*try)(struct socket *so, struct ktls_session *tls); + int (*try)(struct socket *so, struct ktls_session *tls, bool transmit); int prio; int api_version; int use_count; @@ -153,10 +179,15 @@ }; struct ktls_session { - int (*sw_encrypt)(struct ktls_session *tls, - const struct tls_record_layer *hdr, uint8_t *trailer, - struct iovec *src, struct iovec *dst, int iovcnt, - uint64_t seqno, uint8_t record_type); + union { + int (*sw_encrypt)(struct ktls_session *tls, + const struct tls_record_layer *hdr, uint8_t *trailer, + struct iovec *src, struct iovec *dst, int iovcnt, + uint64_t seqno, uint8_t record_type); + int (*sw_decrypt)(struct ktls_session *tls, + const struct tls_record_layer *hdr, struct iovec *iov, + int iovcnt, uint64_t seqno, int *trailer_len); + }; union { void *cipher; struct m_snd_tag *snd_tag; @@ -173,8 +204,10 @@ bool reset_pending; } __aligned(CACHE_LINE_SIZE); +void ktls_check_rx(struct sockbuf *sb); int ktls_crypto_backend_register(struct ktls_crypto_backend *be); int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be); +int ktls_enable_rx(struct socket *so, struct tls_enable *en); int ktls_enable_tx(struct socket *so, struct tls_enable *en); void ktls_destroy(struct ktls_session *tls); void ktls_frame(struct mbuf *m, struct ktls_session *tls, int *enqueue_cnt, @@ -182,6 +215,7 @@ void ktls_seq(struct sockbuf *sb, struct mbuf *m); void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count); void ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs); +int ktls_get_rx_mode(struct socket *so); int ktls_set_tx_mode(struct socket *so, int mode); int ktls_get_tx_mode(struct socket *so); int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls); Index: sys/sys/sockbuf.h =================================================================== --- sys/sys/sockbuf.h +++ sys/sys/sockbuf.h @@ -38,6 +38,8 @@ /* * Constants for sb_flags field of struct sockbuf/xsockbuf. 
*/ +#define SB_TLS_RX 0x01 /* using KTLS on RX */ +#define SB_TLS_RX_RUNNING 0x02 /* KTLS RX operation running */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ @@ -99,10 +101,14 @@ u_int sb_ccnt; /* (a) number of clusters in buffer */ u_int sb_mbmax; /* (a) max chars of mbufs to use */ u_int sb_ctl; /* (a) non-data chars in buffer */ + u_int sb_tlscc; /* (a) TLS chain characters */ + u_int sb_tlsdcc; /* (a) TLS characters being decrypted */ int sb_lowat; /* (a) low water mark */ sbintime_t sb_timeo; /* (a) timeout for read/write */ uint64_t sb_tls_seqno; /* (a) TLS seqno */ struct ktls_session *sb_tls_info; /* (a + b) TLS state */ + struct mbuf *sb_mtls; /* (a) TLS mbuf chain */ + struct mbuf *sb_mtlstail; /* (a) last mbuf in TLS chain */ short sb_flags; /* (a) flags, see above */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ @@ -153,6 +159,9 @@ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level); +struct mbuf * + sbcreatecontrol_how(void *p, int size, int type, int level, + int wait); void sbdestroy(struct sockbuf *sb, struct socket *so); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); @@ -178,6 +187,8 @@ void sbunlock(struct sockbuf *sb); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); +void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m); +void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m); int sbready(struct sockbuf *, struct mbuf *, int); /* Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -147,6 +147,9 @@ #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ +#if __BSD_VISIBLE +#define SO_WANT_KTLS 0x00020000 /* kernel TLS might be requested */ +#endif /* * Additional options, not kept in so_options. Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -77,6 +77,7 @@ * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. + * (k) locked by KTLS workqueue mutex */ TAILQ_HEAD(accept_queue, socket); struct socket { @@ -131,6 +132,9 @@ /* (b) cached MAC label for peer */ struct label *so_peerlabel; u_long so_oobmark; /* chars to oob mark */ + + /* (k) Our place on KTLS RX work queue. */ + STAILQ_ENTRY(socket) so_ktls_rx_list; }; /* * Listening socket, where accepts occur, is so_listen in all